

##### Mummolo and Nall, 2016
##### Data Management File



#########Note: This file cleans the raw data used in this study and generates data frames necessary for estimation and the creation of all tables and figures. Set the working directory to the "replication package" folder on your computer before each section of this code and then execute the code below. This will save all necessary data frames locally to the "replication package/data" folder. You can then move on to the "Estimation.R" file to reproduce all results.


#############################################
#############################################
#############################################
#############################################


##### CLEAN AND RECODE VARIABLES IN SSI SURVEY DATA


#############################################
#############################################
#############################################
#############################################


rm(list=ls())##clear workspace

library(xlsx)
library(foreign)
library(stringr)
library(xlsx)
library(sandwich)
library(lmtest)
library(rms)
library(survey)
library(xtable)


as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length

jmwd.2<-"/Users/jonathanmummolo/Dropbox/geogaffect/code/SSI.130604/replication package/"
setwd(jmwd.2)
wd<-"/Users/jonathanmummolo/Dropbox/geogaffect/code/SSI.130604/replication package/"




##read in raw survey data from SSI
dd<-read.csv("data/SSI_Version_130604.csv", header=T, as.is=T)

###remove unwanted data
dd<-dd[-grep("ResponseID",dd), ] ##remove row of labels
##next code removes all the observations generated by admins prior to launch of survey
dd<-dd[!dd$psid == "cristine234", ]
dd<-dd[!dd$psid == "test465869696707", ]
dd<-dd[!dd$psid == "testing123", ]
dd<-dd[!dd$psid == "termtest123", ]
dd<-dd[!dd$psid == "Keelertest123", ]
dd<-dd[!dd$psid == "CristineKeeler123", ]
dd<-dd[!dd$psid == "CrissK123", ]
dd<-dd[!dd$psid == "CrissK123", ]

dim(dd)

# Recoding scheme for variables of interest
recoder<-read.xlsx("data/Recode_Final_SSI_130604.xlsx", 1)


recoder$newvarname<-as.character(recoder$newvarname)


  # Recode values according to Excel spreadhseet
  vals<-recoder[,grep("val", names(recoder))]
  vals<-apply(vals, 2, as.c)
  
  for(i in 1:nrow(recoder)){##for each row in the excel coding document (i.e. each variable)...
  	
  	##replace variable names in the data
    if(!is.na(recoder$newvarname[i])) names(dd)[i]<-recoder$newvarname[i]
  }
  
  ##new loop to assign values to variables
  for(i in 1:nrow(recoder)){##for each row in the excel coding document (i.e. each variable)...
   
    if(!is.na(recoder$newvarname[i])& sum(!is.na(vals[i,]))>0){ ## if new values are in the spreadsheet...
      
      for(j in 1:sum(!is.na(vals[i,]))){ #for each new value to be assigned for this variable...
        
        dd[,recoder$newvarname[i]][dd[,recoder$newvarname[i]]==j]<-vals[i,j]##replace each numeric level of variable with corresponding new value
      
      }

    }
  }
  
  
##drop three conjoint cases that were missing on question prompts but still registered responses

dim(dd)

trait.cols<-grep("Q3.L.F_", names(dd))

miss.conj<-dd$respid[dd[, trait.cols[1]]==""]

dd<-dd[dd$respid!=miss.conj[1],]
dd<-dd[dd$respid!=miss.conj[2],]
dd<-dd[dd$respid!=miss.conj[3],]
	
dim(dd)

##drop cases of people who answered more than one community/housing preference item, which should not have happened (this means other things may have gone wrong in the survey)

dd$comm.resp.test<-paste(dd$rp.will.prefa1,dd$rp.will.prefa2,dd$rp.will.prefa3, dd$rp.will.prefa4,dd$rp.will.prefa5,dd$rp.will.prefa6,dd$rp.will.prefa7,dd$rp.will.prefa8,dd$rp.will.prefa9, sep="") 

table(dd$comm.resp.test)

dd$comm1.double<-0
dd$comm1.double[dd$comm.resp.test=="01" |dd$comm.resp.test=="10"|dd$comm.resp.test=="11"]<-1
table(dd$comm1.double)##11 bad cases

dd$comm.resp.test[which(dd$comm1.double==1)]

dd<-subset(dd, dd$comm1.double!=1)
dim(dd)


dd$comm.resp2.test<-paste(dd$rp.attr.prefa1,dd$rp.attr.prefa2,dd$rp.attr.prefa3, dd$rp.attr.prefa4,dd$rp.attr.prefa5,dd$rp.attr.prefa6,dd$rp.attr.prefa7,dd$rp.attr.prefa8,dd$rp.attr.prefa9, sep="") 

table(dd$comm.resp2.test)##no double responses here


##same for housing pref items:

dd$house.resp.test<-paste(dd$hp.sat.prefa1,dd$hp.sat.prefa2,dd$hp.sat.prefa3, dd$hp.sat.prefa4,dd$hp.sat.prefa5,dd$hp.sat.prefa6,dd$hp.sat.prefa7,dd$hp.sat.prefa8, sep="") 

table(dd$house.resp.test) ##no doubles here

dd$house.resp2.test<-paste(dd$hp.attr.prefa1,dd$hp.attr.prefa2,dd$hp.attr.prefa3, dd$hp.attr.prefa4,dd$hp.attr.prefa5,dd$hp.attr.prefa6,dd$hp.attr.prefa7,dd$hp.attr.prefa8, sep="") 

table(dd$house.resp2.test)##no doubles here
dim(dd)

####
## PARTY coding (combine leaners into their respective parties in new var)
####
#dd$party
table(dd$party, exclude=NULL)
table(dd$partylean, exclude=NULL)
dd$partywlean<-dd$party
###now add the leaners
dd$partywlean[dd$party%in%c("ind", "oth")&!is.na(dd$party)]<-dd$partylean[dd$party%in%c("ind", "oth")&!is.na(dd$party)]##replace those who answered ind. or other on first question with whatever the answer to the second "leaner" question is
table(dd$party)
table(dd$partylean)
table(dd$partywlean)



##generate DEMOCRAT DUMMY
table(dd$partywlean, exclude=NULL)
dd$democrat<-NA
dd$democrat[dd$partywlean=="dem"]<-1
dd$democrat[dd$partywlean!="dem" & !is.na(dd$partywlean) & dd$partywlean!=""]<-0
table(dd$democrat)

##REPUBLICAN DUMMY
table(dd$partywlean)
dd$republican<-NA
dd$republican[dd$partywlean=="rep"]<-1
dd$republican[dd$partywlean!="rep" & !is.na(dd$partywlean)& dd$partywlean!=""]<-0
table(dd$republican)

##dummies exluding leaners
table(dd$party)
table(dd$partylean)
dd$democrat2<-NA
dd$democrat2[dd$party=="dem"]<-1
dd$democrat2[dd$party=="rep"]<-0
table(dd$democrat2)


dd$republican2<-NA
dd$republican2[dd$party=="rep"]<-1
dd$republican2[dd$party=="dem"]<-0
table(dd$republican2)




####
#####
###Code up total response time
#####
#####


##calculates total time
dd$stop <- strptime(dd$stop, "%Y-%m-%d %H:%M:%OS")
dd$start <- strptime(dd$start, "%Y-%m-%d %H:%M:%OS")
dd$time<-(dd$stop-dd$start)/60
dd$time<-as.numeric(dd$time)
summary(dd$time)


##generate speeder dummy var
##median total time = 12.88 mins
dd$speeder<-NA
dd$speeder[dd$time>median(dd$time, na.rm=TRUE)]<-1
dd$speeder[dd$time<=median(dd$time, na.rm=TRUE)]<-0
table(dd$speeder)

##generate response time threshholds

dd$time5<-0
dd$time5[dd$time<5]<-1 
dd$time6<-0
dd$time6[dd$time>=5 & dd$time<6 ]<-1 
dd$time7<-0
dd$time7[dd$time>=6 & dd$time<7 ]<-1 
dd$time8<-0
dd$time8[dd$time>=7 & dd$time<8 ]<-1 
dd$time9<-0
dd$time9[dd$time>=8 & dd$time<9 ]<-1 
dd$time10<-0
dd$time10[dd$time>=9 & dd$time<10 ]<-1 
dd$time11<-0
dd$time11[dd$time>=10 & dd$time<11 ]<-1 
dd$time12<-0
dd$time12[dd$time>=11 & dd$time<12 ]<-1 
dd$time13<-0
dd$time13[dd$time>=12 & dd$time<13 ]<-1 
dd$time14<-0
dd$time14[dd$time>=13 & dd$time<14 ]<-1 
dd$time15<-0
dd$time15[dd$time>=14 & dd$time<15 ]<-1 
dd$time16<-0
dd$time16[dd$time>15 ]<-1 

time.freq<-as.data.frame(rbind(sum(dd$time5 ), sum(dd$time6), sum(dd$time7), sum(dd$time8),sum(dd$time9), sum(dd$time10), sum(dd$time11), sum(dd$time12), sum(dd$time13), sum(dd$time14), sum(dd$time15), sum(dd$time16)))
rownames(time.freq)<-c("<5min", ">=5 & <6 mins.",">=6 & <7 mins.",">=7 & <8 mins.",">=8 & <9 mins.",">=9 & <10 mins.",">=10 & <11 mins.",">=11 & <12 mins.",">=12 & <13 mins.",">=13 & <14 mins.", ">=14 & <15 mins.", ">15 mins")
time.freq$prop<-time.freq[,1]/dim(dd)[1] ##generate proportions of speeders
colnames(time.freq)<-c("Frequency", "Proportion")
time.freq



###generate single factor variable to represent time category
dd$time.level<-NA
dd$time.level[dd$time5==1]<-"<5 Mins."
dd$time.level[dd$time6==1]<-">=5 & <6 Mins."
dd$time.level[dd$time7==1]<-">=6 & <7 Mins."
dd$time.level[dd$time8==1]<-">=7 & <8 Mins."
dd$time.level[dd$time9==1]<-">=8 & <9 Mins."
dd$time.level[dd$time10==1]<-">=9 & <10 Mins."
dd$time.level[dd$time11==1]<-">=10 & <11 Mins."
dd$time.level[dd$time12==1]<-">=11 & <12 Mins."
dd$time.level[dd$time13==1]<-">=12 & <13 Mins."
dd$time.level[dd$time14==1]<-">=13 & <14 Mins."
dd$time.level[dd$time15==1]<-">=14 & <15 Mins."

dd$time.level<-as.factor(as.character(dd$time.level))
table(dd$time.level)

##make numeric version of this variable
dd$time.level2<-NA
dd$time.level2[dd$time5==1]<-5
dd$time.level2[dd$time6==1]<-6
dd$time.level2[dd$time7==1]<-7
dd$time.level2[dd$time8==1]<-8
dd$time.level2[dd$time9==1]<-9
dd$time.level2[dd$time10==1]<-10
dd$time.level2[dd$time11==1]<-11
dd$time.level2[dd$time12==1]<-12
dd$time.level2[dd$time13==1]<-13
dd$time.level2[dd$time14==1]<-14
dd$time.level2[dd$time15==1]<-15
dd$time.level2[dd$time16==1]<-16
table(dd$time.level2)




##own/rent
##for some reason the recoder does not get this one right. the values are also weird in the qualtrics data. instead of the first response being coded as 1 it is coded as 4. it then goes 5,7,6 for the next three. manually recode these below

dd$ownrent[dd$ownrent=="4"]<-"own"
dd$ownrent[dd$ownrent=="5"]<-"rent"
dd$ownrent[dd$ownrent=="6"]<-"other"
dd$ownrent[dd$ownrent=="7"]<-"neither"
table(dd$ownrent)

dd$ownrent2<-NA
dd$ownrent2[dd$ownrent=="own"]<-1
dd$ownrent2[dd$ownrent=="rent"]<-0
table(dd$ownrent2)

##income
table(dd$income)


##numeric income, based on category midpoints
dd$income.num<-NA
dd$income.num[dd$income=="1.30kless"]<-(30/2)
dd$income.num[dd$income=="10.120kplus"]<-(220+120)/2##top is arbitrary
dd$income.num[dd$income=="2.30to39k"]<-(30+39)/2
dd$income.num[dd$income=="3.40to49k"]<-(40+49)/2
dd$income.num[dd$income=="4.50to59k"]<-(50+59)/2
dd$income.num[dd$income=="5.60to69k"]<-(60+69)/2
dd$income.num[dd$income=="6.70to79k"]<-(70+79)/2
dd$income.num[dd$income=="7.80to89k"]<-(80+89)/2
dd$income.num[dd$income=="8.90to99k"]<-(90+99)/2
dd$income.num[dd$income=="9.100to119k"]<-(100+119)/2
summary(dd$income.num)
table(dd$income.num)


##rich

##U.S. Census has the top 10% of households > than $135k and top 25% > $85k. For now I'll code "rich" as > $90k a year
# source: https://en.wikipedia.org/wiki/File:Distribution_of_Annual_Household_Income_in_the_United_States.png

dd$rich<-NA
dd$rich[dd$income=="7.80to89k"|dd$income=="8.90to99k"|dd$income=="9.100to119k"|dd$income=="10.120kplus"]<-1
dd$rich[dd$income=="1.30kless"|dd$income=="2.30to39k"|dd$income=="3.40to49k"|dd$income=="4.50to59k"|dd$income=="5.60to69k"|dd$income=="6.70to79k"]<-0
table(dd$rich)

##three-level income scheme
#inc3: 3-category income variable
#high-income (>=100k)
#middle-income (>40k and <100k)
#"low" income (<40k)
summary(dd$income.num)
dd$inc3<-NA
dd$inc3[dd$income.num>100]<-"high.inc"
dd$inc3[dd$income.num>=40 & dd$income.num<=100]<-"middle.inc"
dd$inc3[dd$income.num<40]<-"low.inc"
table(dd$inc3)

##employment status

table(dd$employed, exclude=NULL)
names(table(dd$employed, exclude=NULL))

not.work<-c("full-time homemaker"  ,   "not employed"      , "other paid employment" ,  "part-time paid", "part-time self-employed"   )
work<-c("full-time paid" , "full-time self-employed",   "occasional employment" ,  "retired"  )
work2<-c("full-time paid" , "full-time self-employed",   "occasional employment" )

dd$working<-NA
dd$working[dd$employed%in%work]<-1
dd$working[dd$employed%in%not.work]<-0
table(dd$working)
mean(dd$working, na.rm=T)##63% are employed in some way

dd$retired<-NA
dd$retired[dd$employed%in%c("retired")]<-1
dd$retired[dd$employed%in%not.work|dd$employed%in%work2]<-0 ##in any other non missing category
table(dd$retired)
mean(dd$retired, na.rm=T)##30% are retired




##kids at home?
dd$kidshome<-as.n(as.c(dd$kidshome))
table(dd$kidshome)

##city v non-city residents
table(dd$nbhdescrip)
dd$city<-NA
dd$city[dd$nbhdescrip=="1.citydwntwn"|dd$nbhdescrip=="2.cityresid"]<-1
dd$city[dd$nbhdescrip=="3.suburbdtwn"|dd$nbhdescrip=="4.suburbresid"|dd$nbhdescrip=="5.smalltown"|dd$nbhdescrip=="5.smalltown"|dd$nbhdescrip=="6.ruralfarm"|dd$nbhdescrip=="7.ruralnofarm"]<-0
table(dd$city)

##feeling therm toward races
dd$thrm.hispanics<-as.n(as.c(dd$thrm.hispanics))
dd$thrm.blacks<-as.n(as.c(dd$thrm.blacks))
dd$thrm.whites<-as.n(as.c(dd$thrm.whites))
table(dd$thrm.hispanics)
table(dd$thrm.blacks)
table(dd$thrm.whites)

##other feeling thermometers
dd$thrm.obama<-as.n(as.c(dd$thrm.obama))
table(dd$thrm.obama)
dd$thrm.unions<-as.n(as.c(dd$thrm.unions))
table(dd$thrm.unions)
dd$thrm.execs<-as.n(as.c(dd$thrm.execs))
table(dd$thrm.execs)
dd$thrm.atheists<-as.n(as.c(dd$thrm.atheists))
table(dd$thrm.atheists)
dd$thrm.gop<-as.n(as.c(dd$thrm.gop))
table(dd$thrm.gop)
dd$thrm.dem<-as.n(as.c(dd$thrm.dem))
table(dd$thrm.dem)

dd$thrm.undoc<-as.n(as.c(dd$thrm.undoc))
table(dd$thrm.undoc)

dd$thrm.urbanites<-as.n(as.c(dd$thrm.urbanites))
table(dd$thrm.urbanites, exclude=NULL)

dd$thrm.suburbanites<-as.n(as.c(dd$thrm.suburbanites))
table(dd$thrm.suburbanites)

dd$thrm.rurals<-as.n(as.c(dd$thrm.rurals))
table(dd$thrm.rurals, exclude=NULL)

dd$thrm.enviros<-as.n(as.c(dd$thrm.enviros))
table(dd$thrm.enviros)

dd$thrm.pedestrians<-as.n(as.c(dd$thrm.pedestrians))
table(dd$thrm.pedestrians)

dd$thrm.cyclists<-as.n(as.c(dd$thrm.cyclists))
table(dd$thrm.cyclists)

dd$thrm.teachers<-as.n(as.c(dd$thrm.teachers))
table(dd$thrm.teachers)


##feeling therm toward blacks, split at median
dd$black.therm<-NA
dd$black.therm[dd$thrm.blacks>median(dd$thrm.blacks, na.rm=TRUE)]<-1
dd$black.therm[dd$thrm.blacks<=median(dd$thrm.blacks, na.rm=TRUE)]<-0
table(dd$black.therm)

#feeling therm toward blacks and hispanics
## first average the two, then split at median
dd$black.hisp.therm.mean<-(dd$thrm.blacks+dd$thrm.hispanics)/2
summary(dd$black.hisp.therm.mean)
dd$black.hisp.therm.mean[1:50]
dd$black.hisp.therm<-NA
dd$black.hisp.therm[dd$black.hisp.therm.mean>median(dd$black.hisp.therm.mean, na.rm=TRUE)]<-1
dd$black.hisp.therm[dd$black.hisp.therm.mean<=median(dd$black.hisp.therm.mean, na.rm=TRUE)]<-0
table(dd$black.hisp.therm)

##like politics
table(dd$likepolitics)
dd$likepol<-NA
dd$likepol[dd$likepolitics=="1.D"]<-1 ##does not like reading/talking about pol
dd$likepol[dd$likepolitics=="2.N"]<-2 ##neutral
dd$likepol[dd$likepolitics=="3.A"]<-3 ##likes politics
table(dd$likepol)

##did resp vote in 2012?
table(dd$vote2012)
dd$vote<-as.n(as.c(dd$vote2012))
table(dd$vote)


##difference between feeling therm for whites and blacks (positive value indicates preferences for whites)
dd$white.black.diff.score<-dd$thrm.whites- dd$thrm.blacks
summary(dd$white.black.diff.score)
dd$white.black.diff.score[1:50]
dd$white.black.diff<-NA
dd$white.black.diff[dd$white.black.diff.score>median(dd$white.black.diff.score, na.rm=TRUE)]<-1
dd$white.black.diff[dd$white.black.diff.score<=median(dd$white.black.diff.score, na.rm=TRUE)]<-0
table(dd$white.black.diff)

##native english speaker; make numeric
table(dd$nativeeng)
dd$nativeeng<-as.n(as.c(dd$nativeeng))
table(dd$nativeeng)

##pre-screen questions
table(dd$screen.pre.1)##1=True, 0=False...correct answer is FALSE
dd$pre.screen.1.c<-NA
dd$pre.screen.1.c[dd$screen.pre.1==0]<-1
dd$pre.screen.1.c[dd$screen.pre.1==1 & !is.na(dd$screen.pre.1)]<-0
table(dd$pre.screen.1.c)

table(dd$screen.pre.2)##1=True, 0=False...correct answer is FALSE
dd$pre.screen.2.c<-NA
dd$pre.screen.2.c[dd$screen.pre.2==0]<-1
dd$pre.screen.2.c[dd$screen.pre.2==1 & !is.na(dd$screen.pre.2)]<-0
table(dd$pre.screen.2.c)


##post screen questions
##code missing values as 0 (if they didnt respond, they failed)

##make first post screen question numeric
dd$post.screen1<-as.n(as.c(dd$post.screen1))
dd$post.screen1[is.na(dd$post.screen1)==TRUE]<-0
table(dd$post.screen1)

##generate second screen var
dd$post.screen2.new<-NA
dd$post.screen2.new[dd$post.screen2!=""]<-1
dd$post.screen2.new[dd$post.screen2==""]<-0
dd$post.screen2.new[is.na(dd$post.screen2)==TRUE]<-0

table(dd$post.screen2.new)


## recode #dd$modetowork which also has buggy vlaues
##mode to work questions also messed up
## no values 3's or 4's in data so cant confirm their labels
dd$modetowork[dd$modetowork=="1"]<-"alone.car"
dd$modetowork[dd$modetowork=="2"]<-"shared.car"
dd$modetowork[dd$modetowork=="5"]<-"motorcycle"
dd$modetowork[dd$modetowork=="6"]<-"bus/van"
dd$modetowork[dd$modetowork=="7"]<-"train"
dd$modetowork[dd$modetowork=="8"]<-"subway"
dd$modetowork[dd$modetowork=="9"]<-"bicycle"
dd$modetowork[dd$modetowork=="10"]<-"walking"
dd$modetowork[dd$modetowork=="11"]<-"other"

##mode to school questions also messed up
## no values 5's or 6's or 7's or 9's in data so cant confirm their labels

dd$modetoschool[dd$modetoschool=="1"]<-"alone.car"
dd$modetoschool[dd$modetoschool=="2"]<-"shared.car"
dd$modetoschool[dd$modetoschool=="3"]<-"bus"
dd$modetoschool[dd$modetoschool=="4"]<-"train"
dd$modetoschool[dd$modetoschool=="8"]<-"subway"
dd$modetoschool[dd$modetoschool=="10"]<-"walking"
dd$modetoschool[dd$modetoschool=="11"]<-"other"

##recode dd$student which is also buggy
dd$student[dd$student=="1"]<-"full.time"
dd$student[dd$student=="2"]<-"part.time.campus"
dd$student[dd$student=="3"]<-"no"
dd$student[dd$student=="5"]<-"online"


##recode commuteinwork
table(as.n(as.c(dd$commuteminwork)))
dd$commuteminwork[dd$commuteminwork=="05"]<-"5"
dd$commuteminwork[dd$commuteminwork=="1hr"]<-"60"
dd$commuteminwork[dd$commuteminwork=="30 "]<-"30"
dd$commuteminwork[dd$commuteminwork=="60 "]<-"60"
dd$commuteminwork[dd$commuteminwork=="3 m"]<-"3"
dd$commuteminwork[dd$commuteminwork=="ni"]<-NA
dd$commuteminwork[dd$commuteminwork=="QP"]<-NA
table(dd$commuteminwork)
table(as.n(as.c(dd$commuteminwork)))
dd$commuteminwork<-as.n(as.c(dd$commuteminwork))
table(dd$commuteminwork)
##recode commuteminschool
table(dd$commuteminschool)##values appear all valid


dim(dd)


dd$age<-18 + as.n(as.c(dd$birthyr)) -1
table(dd$age)

##make "OLD" variable
dd$old<-NA
dd$old[dd$age>65]<-1
dd$old[dd$age<=65]<-0
table(dd$old)
##gender
table(dd$gender)
dd$female<-NA
dd$female[dd$gender==2]<-1
dd$female[dd$gender==1]<-0
table(dd$female)

##make variables for recent college grad aged
dd$young1<-NA
dd$young1[dd$age<=25]<-1
dd$young1[dd$age>25 & !is.na(dd$age)]<-0
table(dd$young1)

dd$young2<-NA
dd$young2[dd$age<=30]<-1
dd$young2[dd$age>30 & !is.na(dd$age)]<-0
table(dd$young2)

dd$young3<-NA
dd$young3[dd$age<=35]<-1
dd$young3[dd$age>35 & !is.na(dd$age)]<-0
table(dd$young3)



##Education
table(dd$educ)
##recode to be BA+ / Non-BA+
dd$educ2<-NA
dd$educ2[dd$educ=="5colldeg4yr"|dd$educ=="6ma"|dd$educ=="7phd"|dd$educ=="8profdeg"]<-1
dd$educ2[dd$educ=="1hsless"|dd$educ=="2hsgrad"|dd$educ=="3somecol"|dd$educ=="4colldeg2yr"]<-0
table(dd$educ2)

##education in years (coded to sync to ANES 2012 which had fewer categories)
dd$educ.years<-NA
dd$educ.years[dd$educ=="1hsless"]<-8
dd$educ.years[dd$educ=="2hsgrad"]<-12
dd$educ.years[dd$educ=="3somecol"]<-13
dd$educ.years[dd$educ=="4colldeg2yr"]<-14
dd$educ.years[dd$educ=="5colldeg4yr"]<-16
dd$educ.years[dd$educ=="6ma"]<-18
dd$educ.years[dd$educ=="8profdeg"]<-19
dd$educ.years[dd$educ=="7phd"]<-21
table(dd$educ.years)

##martial status
table(dd$marital)
##recode into married or not
dd$married<-NA
dd$married[dd$marital=="married"]<-1
dd$married[dd$marital=="legal.dom.part"]<-1
dd$married[dd$marital=="divorced"| dd$marital=="other"|dd$marital=="separated"|dd$marital=="single.nevermar"|dd$marital=="widowed" ]<-0
table(dd$married)

names(table(dd$marital, exclude=NULL) )##put other in NA category
single<-c(  "divorced"     ,  "single.nevermar"  , "separated" ,  "widowed"   )
not.single<-c("legal.dom.part" , "married"   )
    

dd$single<-NA
dd$single[dd$marital%in%single]<-1
dd$single[dd$marital%in%not.single]<-0
table(dd$single, exclude=NULL)
mean(dd$single,na.rm=T)##44 % of people single




##recode ideology
dd$ideol<-NA
dd$ideol[dd$libcon=="1.cons"]<-1
dd$ideol[dd$libcon=="2.mod"]<-2
dd$ideol[dd$libcon=="3.lib"]<-3
table(dd$ideol)
table(dd$libcon)
table(dd$ideol)

##recode interest in politics
dd$likepol<-NA
dd$likepol[dd$likepolitics=="1.D"]<-1
dd$likepol[dd$likepolitics=="2.N"]<-2
dd$likepol[dd$likepolitics=="3.A"]<-3
table(dd$likepol)
table(dd$likepolitics)

##recode race vars to be dichotomous
##respondents were told to check all that apply, so the baseline of zeros will be those that gave a 1 to at least one answer.
##coding here as non-hispanic white, black, asian, etc
dd$white2<-NA
dd$white2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$white2[dd$white=="1" & dd$hisp!="1" ]<-1

dd$black2<-NA
dd$black2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$black2[dd$black=="1" & dd$hisp!="1"]<-1

dd$hisp2<-NA
dd$hisp2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$hisp2[dd$hisp=="1"]<-1

dd$asian2<-NA
dd$asian2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$asian2[dd$asian=="1" & dd$hisp!="1"]<-1

dd$pac.island2<-NA
dd$pac.island2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$pac.island2[dd$pac.island=="1" & dd$hisp!="1"]<-1

dd$native.amer2<-NA
dd$native.amer2[dd$white=="1"|dd$black=="1"|  dd$hisp=="1"| dd$asian=="1"| dd$pac.island=="1"| dd$native.amer=="1"]<-0
dd$native.amer2[dd$native.amer=="1" & dd$hisp!="1"]<-1

##did the resp. move in the past 5 years?
table(dd$homelast5)
dd$homelast5<-as.n(as.c(dd$homelast5))

##white democrat indicator
dd$white.dem<-NA
dd$white.dem[dd$partywlean=="dem" & dd$white2==1]<-1
dd$white.dem[dd$partywlean=="dem" & dd$white2!=1 & !is.na(dd$white2) & !is.na(dd$partywlean)]<-0##non-white dems
table(dd$white.dem,exclude=NULL)
mean(dd$white.dem, na.rm=T)##76% of democrats are nh white in sample


dim(dd)



########
#######
###### POLICY ITEMS
#######
##########

policy.names<-c("plcy.morelanes", "plcy.morebus", "plcy.morerail", "plcy.morehwy","plcy.usegastax","plcy.cartaxbreak","plcy.mortgagetax","plcy.poortaxbreak")

new.policy.names<-c("lanes","bus","rail","hwy","gas.tax","car.tax","mortgage","poor.tax.break")


for (i in 1:len(policy.names)){
print(table(dd[,policy.names[i]]))
print(class(dd[,policy.names[i]]))
}

for (i in 1:len(policy.names)){
	dd[,new.policy.names[i]]<-NA
	}
	
dd$lanes[dd$plcy.morelanes=="1.SO"|dd$plcy.morelanes=="2.SMO"]<-1
dd$lanes[dd$plcy.morelanes=="4.SMS"|dd$plcy.morelanes=="5.SS"|dd$plcy.morelanes=="3.N" ]<-0
		
dd$bus[dd$plcy.morebus=="1.SO"|dd$plcy.morebus=="2.SMO"|dd$plcy.morebus=="3.N" ]<-0
dd$bus[dd$plcy.morebus=="4.SMS"|dd$plcy.morebus=="5.SS"]<-1
	
dd$rail[dd$plcy.morerail=="1.SO"|dd$plcy.morerail=="2.SMO"|dd$plcy.morerail=="3.N" ]<-0
dd$rail[dd$plcy.morerail=="4.SMS"|dd$plcy.morerail=="5.SS"]<-1

dd$hwy[dd$plcy.morehwy=="1.SO"|dd$plcy.morehwy=="2.SMO"]<-1
dd$hwy[dd$plcy.morehwy=="4.SMS"|dd$plcy.morehwy=="5.SS"|dd$plcy.morehwy=="3.N" ]<-0

dd$gas.tax[dd$plcy.usegastax=="1.SO"|dd$plcy.usegastax=="2.SMO"|dd$plcy.usegastax=="3.N" ]<-0
dd$gas.tax[dd$plcy.usegastax=="4.SMS"|dd$plcy.usegastax=="5.SS"]<-1

dd$car.tax[dd$plcy.cartaxbreak=="1.SO"|dd$plcy.cartaxbreak=="2.SMO"]<-1
dd$car.tax[dd$plcy.cartaxbreak=="4.SMS"|dd$plcy.cartaxbreak=="5.SS"|dd$plcy.cartaxbreak=="3.N" ]<-0

dd$mortgage[dd$plcy.mortgagetax=="1.SO"|dd$plcy.mortgagetax=="2.SMO"|dd$plcy.mortgagetax=="3.N" ]<-0
dd$mortgage[dd$plcy.mortgagetax=="4.SMS"|dd$plcy.mortgagetax=="5.SS"]<-1

dd$poor.tax.break[dd$plcy.poortaxbreak=="1.SO"|dd$plcy.poortaxbreak=="2.SMO"|dd$plcy.poortaxbreak=="3.N" ]<-0
dd$poor.tax.break[dd$plcy.poortaxbreak=="4.SMS"|dd$plcy.poortaxbreak=="5.SS"]<-1


for (i in 1:len(policy.names)){
	print(table(dd[,new.policy.names[i]]))
	}


ssi.policy.labels<-c("Against More Highway Lanes in Suburbs", "Expand Urban Bus Service into Suburbs", "Against More High-Speed Rail","More Highway Lanes in Cities", "Use Gas Tax to Pay for Transit", "Against Tax Break for Car Commute", "Tax Break for Mortgage Interest","Assist Poor w/ Transit Costs"     )

len(ssi.policy.labels)==len(new.policy.names)


for (i in 1:len(ssi.policy.labels)){
	dd[,ssi.policy.labels[i]]<-NA
	}

for (i in 1:len(policy.names)){
	dd[,ssi.policy.labels[i]]<-dd[,new.policy.names[i]]
	print(table(dd[,ssi.policy.labels[i]] ))
	}


#####
#### GENERATE INDICES OF SORTED PARTISANSHIP
#####
#####


#### INDEX 1: Did respondent give the "right" answer to the ideology item?

table(dd$ideol)##1 = conservative; 2 = moderate, 3 = liberal
class(dd$ideol)

##code new variables to ID Democrats who said they were liberal; Republicans who said they were conservative

dd$libdem<-NA
dd$libdem[dd$partywlean=="dem" & dd$ideol==3]<-1
dd$libdem[dd$partywlean=="dem" & dd$ideol!=3 & !is.na(dd$partywlean) & !is.na(dd$ideol)]<-0
table(dd$libdem)
table(dd$partywlean, dd$ideol)


dd$conrep<-NA
dd$conrep[dd$partywlean =="rep" & dd$ideol==1]<-1
dd$conrep[dd$partywlean=="rep" & dd$ideol!=1 & !is.na(dd$partywlean) & !is.na(dd$ideol)]<-0
table(dd$conrep)
table(dd$partywlean, dd$ideol)

dim(dd)

##DROP FOREIGN RESPONDENTS

###zip codes/region
##drop cases with definite foreign zip codes
dd<-dd[!dd$zip== "53732",]
dd<-dd[!dd$zip =="09801",]
dd<-dd[!dd$zip =="26115",]


dd$zip[dd$zip==""]<-NA ##i think clayton does this in his code too, but just to be sue

##code previous zip codes
dd$lastzip.text<-as.c(dd$lastzip.text)
dd$lastzip.text[dd$lastzip.text==""]<-NA
table(dd$lastzip.text)
dd$lastzip.text[nchar(dd$lastzip.text)==6]

# returns string w/o leading or trailing whitespace
#sourc: http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r
trim <- function (x) gsub("^\\s+|\\s+$", "", x)

dd$lastzip.text<-trim(dd$lastzip.text)

table(nchar(dd$lastzip.text))
dd$lastzip.text[nchar(dd$lastzip.text)==6]
dd$lastzip.text[nchar(dd$lastzip.text)==2]
dd$lastzip.text[nchar(dd$lastzip.text)==1]
dd$lastzip.text[nchar(dd$lastzip.text)!=5]<-NA
dd$lastzip.text[nchar(dd$lastzip.text)==2]<-NA
table(nchar(dd$lastzip.text), exclude=NULL)
class(dd$lastzip.text)

class(dd$zip)
dd$zip<-trim(dd$zip)
table(nchar(dd$zip))
dd$zip[nchar(dd$zip)==4]##there is one 4 digit zip code here but it's unclear which character was left out by the respondent so will code as NA
dd$zip[nchar(dd$zip)==4]<-NA


##previous city corrections
table(dd$lastcity.text)
dd$prev.city<-as.character(dd$lastcity.text)
dd$prev.city[dd$prev.city=="adrian"]<-"Adrian"
dd$prev.city[dd$prev.city=="anaheim"]<-"Anaheim"
dd$prev.city[dd$prev.city=="ballimore"]<-"Baltimore" ###? could be austrailia
dd$prev.city[dd$prev.city=="billings"]<-"Billings"
dd$prev.city[dd$prev.city=="BARDSTOWN"]<-"Bardstown"
dd$prev.city[dd$prev.city=="charlotte"]<-"Charlotte"
dd$prev.city[dd$prev.city=="chester"]<-"Chester"
dd$prev.city[dd$prev.city=="china grove"]<-"China Grove"
dd$prev.city[dd$prev.city=="cincinnati"]<-"Cincinatti"
dd$prev.city[dd$prev.city=="Citrus heights"]<-"Citrus Heights"
dd$prev.city[dd$prev.city=="claremont"]<-"Claremont"
dd$prev.city[dd$prev.city=="cleveland"]<-"Cleveland"
dd$prev.city[dd$prev.city=="concord"]<-"Concord"
dd$prev.city[dd$prev.city=="dallas"]<-"Dallas"
dd$prev.city[dd$prev.city=="dayton"]<-"Dayton"
dd$prev.city[dd$prev.city=="delray beach"]<-"Delray Beach"
dd$prev.city[dd$prev.city=="des moines"]<-"Des Moines"
dd$prev.city[dd$prev.city=="farrell"]<-"Farrell"
dd$prev.city[dd$prev.city=="francestown"]<-"Francestown"
dd$prev.city[dd$prev.city=="GRAYSON"]<-"Grayson"
dd$prev.city[dd$prev.city=="hollywood"]<-"Hollywood"
dd$prev.city[dd$prev.city=="kenmore"]<-"Kenmore"
dd$prev.city[dd$prev.city=="killeen"]<-"Killeen"
dd$prev.city[dd$prev.city=="lakewood"]<-"Lakewood"
dd$prev.city[dd$prev.city=="las cruces"]<-"Las Cruces"
dd$prev.city[dd$prev.city=="lenoir"]<-"Lenoir"
dd$prev.city[dd$prev.city=="littleton"]<-"Littleton"
dd$prev.city[dd$prev.city=="lodi"]<-"Lodi"
dd$prev.city[dd$prev.city=="los angeles"]<-"Los Angeles"
dd$prev.city[dd$prev.city=="marlette"]<-"Marlette"
dd$prev.city[dd$prev.city=="miami"]<-"Miami"
dd$prev.city[dd$prev.city=="mountaintop"]<-"Mountaintop"
dd$prev.city[dd$prev.city=="new york"]<-"New York"
dd$prev.city[dd$prev.city=="oldsmar"]<-"Oldsmar"
dd$prev.city[dd$prev.city=="oxnard"]<-"Oxnard"
dd$prev.city[dd$prev.city=="pensacola"]<-"Pensacola"
dd$prev.city[dd$prev.city=="philadelphia"]<-"Philadelphia"
dd$prev.city[dd$prev.city=="pl"]<-NA ##this person put California as state. but no way to tell where in CA. Code as NA for now. Reassign to state centroid
dd$prev.city[dd$prev.city=="pleasetnon"]<-"Pleasanton"
dd$prev.city[dd$prev.city=="pompton lakes"]<-"Pompton Lakes"
dd$prev.city[dd$prev.city=="renfrew"]<-"Renfrew"
dd$prev.city[dd$prev.city=="riverside"]<-"Riverside"
dd$prev.city[dd$prev.city=="Rocky hill"]<-"Rocky Hill"
dd$prev.city[dd$prev.city=="ronkonkoma"]<-"Ronkonkoma"
dd$prev.city[dd$prev.city=="san antonio"]<-"San Antonio"
dd$prev.city[dd$prev.city=="san diego"]<-"San Antonio"
dd$prev.city[dd$prev.city=="san pedro"]<-"San Pedro"
dd$prev.city[dd$prev.city=="sandpoint"]<-"Sandpoint"
dd$prev.city[dd$prev.city=="timberlake"]<-"Timberlake"
dd$prev.city[dd$prev.city=="trenton"]<-"Trenton"
dd$prev.city[dd$prev.city=="venice"]<-"Venice"
dd$prev.city[dd$prev.city=="wanaque"]<-"Wanaque"
dd$prev.city[dd$prev.city=="WAXHAW"]<-"Waxhaw"
dd$prev.city[dd$prev.city=="west chester"]<-"Westchester"
dd$prev.city[dd$prev.city=="ballwin"]<-"Ballwin"
table(dd$prev.city)

##previous state corrections
table(dd$laststate.text)
dd$prev.state<-as.character(dd$laststate.text)
dd$prev.state[dd$prev.state=="As"]<-"Arizona" ##city with this entry is Mesa, and z and s are very close together on the keyboard, so I assume this is supposed to be Az
dd$prev.state[dd$prev.state=="Az."]<-"Arizona"
dd$prev.state[dd$prev.state=="Ca"]<-"California"
dd$prev.state[dd$prev.state=="CA"]<-"California"
dd$prev.state[dd$prev.state=="california"]<-"California"
dd$prev.state[dd$prev.state=="ca"]<-"California"
dd$prev.state[dd$prev.state=="co"]<-"Colorado"
dd$prev.state[dd$prev.state=="colorado"]<-"Colorado"
dd$prev.state[dd$prev.state=="ct"]<-"Connecticut"
dd$prev.state[dd$prev.state=="CT"]<-"Connecticut"
dd$prev.state[dd$prev.state=="fl"]<-"Florida"
dd$prev.state[dd$prev.state=="FL"]<-"Florida"
dd$prev.state[dd$prev.state=="florida"]<-"Florida"
dd$prev.state[dd$prev.state=="id"]<-"Idaho"
dd$prev.state[dd$prev.state=="il"]<-"Illinois"
dd$prev.state[dd$prev.state=="IL"]<-"Illinois"
dd$prev.state[dd$prev.state=="in"]<-"Indiana"
dd$prev.state[dd$prev.state=="IOWA"]<-"Iowa"
dd$prev.state[dd$prev.state=="KENTUCKY"]<-"Kentucky"
dd$prev.state[dd$prev.state=="Ky"]<-"Kentucky"
dd$prev.state[dd$prev.state=="KY"]<-"Kentucky"
dd$prev.state[dd$prev.state=="maryland"]<-"Maryaland"
dd$prev.state[dd$prev.state=="MD"]<-"Maryaland"
dd$prev.state[dd$prev.state=="mich"]<-"Michigan"
dd$prev.state[dd$prev.state=="mi"]<-"Michigan"
dd$prev.state[dd$prev.state=="michigan"]<-"Michigan"
dd$prev.state[dd$prev.state=="MN"]<-"Minnesota"
dd$prev.state[dd$prev.state=="MT"]<-"Montana"
dd$prev.state[dd$prev.state=="mo"]<-"Montana"
dd$prev.state[dd$prev.state=="nc"]<-"North Carolina"
dd$prev.state[dd$prev.state=="Nc"]<-"North Carolina"
dd$prev.state[dd$prev.state=="NC"]<-"North Carolina"
dd$prev.state[dd$prev.state=="NE"]<-"Nebraska"
dd$prev.state[dd$prev.state=="new york"]<-"New York"
dd$prev.state[dd$prev.state=="NY"]<-"New York"
dd$prev.state[dd$prev.state=="nh"]<-"New Hampshire"
dd$prev.state[dd$prev.state=="nj"]<-"New Jersey"
dd$prev.state[dd$prev.state=="NJ"]<-"New Jersey"
dd$prev.state[dd$prev.state=="nm"]<-"New Mexico"
dd$prev.state[dd$prev.state=="NM"]<-"New Mexico"
dd$prev.state[dd$prev.state=="ohio"]<-"Ohio"
dd$prev.state[dd$prev.state=="OH"]<-"Ohio"
dd$prev.state[dd$prev.state=="pa"]<-"Pennsylvania"
dd$prev.state[dd$prev.state=="PA"]<-"Pennsylvania"
dd$prev.state[dd$prev.state=="texas"]<-"Texas"
dd$prev.state[dd$prev.state=="TN"]<-"Tennessee"
dd$prev.state[dd$prev.state=="tx"]<-"Texas"
dd$prev.state[dd$prev.state=="TX"]<-"Texas"
dd$prev.state[dd$prev.state=="UT"]<-"Utah"
dd$prev.state[dd$prev.state=="va"]<-"Virginia"
dd$prev.state[dd$prev.state=="VA"]<-"Virginia"
dd$prev.state[dd$prev.state=="WA"]<-"Washington"
table(dd$prev.state)

##previous country corrections
table(dd$lastcountry.text)
dd$country<-as.character(dd$lastcountry.text)
dd$country[dd$country=="united states"]<-"USA"
dd$country[dd$country=="United States of America"]<-"USA"
dd$country[dd$country=="us"]<-"USA"
dd$country[dd$country=="US"]<-"USA"
dd$country[dd$country=="usa"]<-"USA"
dd$country[dd$country=="Sacramento"]<-"USA"
#dd$country[dd$country=="3043"]<-"NA"
#dd$country[dd$country=="7465"]<-"NA"
table(dd$country)

dim(dd)

##########
###### SCREENS
##pre-screen questions
table(dd$screen.pre.1)##1=True, 0=False...correct answer is FALSE
dd$pre.screen.1.c<-NA
dd$pre.screen.1.c[dd$screen.pre.1==0]<-1
dd$pre.screen.1.c[dd$screen.pre.1==1 & !is.na(dd$screen.pre.1)]<-0
table(dd$pre.screen.1.c)

table(dd$screen.pre.2)##1=True, 0=False...correct answer is FALSE
dd$pre.screen.2.c<-NA
dd$pre.screen.2.c[dd$screen.pre.2==0]<-1
dd$pre.screen.2.c[dd$screen.pre.2==1 & !is.na(dd$screen.pre.2)]<-0
table(dd$pre.screen.2.c)


##post screen questions

##make first post screen question numeric
dd$post.screen1<-as.n(as.c(dd$post.screen1))
table(dd$post.screen1)

##generate second screen var
dd$post.screen2.new<-NA
dd$post.screen2.new[dd$post.screen2!=""]<-1
dd$post.screen2.new[dd$post.screen2==""]<-0
table(dd$post.screen2.new)

##make variable for those who failed both post-screens
dd$both.post.correct<-NA
dd$both.post.correct[dd$post.screen1==1 & dd$post.screen2.new==1]<-1
dd$both.post.correct[(dd$post.screen1!=1 | dd$post.screen2.new!=1)==TRUE & !is.na(dd$post.screen1 ) &!is.na(dd$post.screen2.new ) ]<-0
table(dd$both.post.correct)

##drop people who did not pass pre-screens
dd<-subset(dd, dd$pre.screen.1.c==1 & dd$pre.screen.2.c==1)
dim(dd)

####drop non-partisan respondents
dd<-subset(dd, dd$partywlean=="dem"|dd$partywlean=="rep")
dim(dd)




###zip code data
##generate function to trim leading and trailing white space from strings
trim <- function (x) gsub("^\\s+|\\s+$", "", x)


dd.zip<-read.csv(paste(wd, "data/zipcode.inced.csv", sep=""),  header=T, as.is=T)

##change zipcodes to characters
dd.zip$zip.c<-as.character(dd.zip$zip)




dd.zip$zip.c<-trim(dd.zip$zip.c)

table(nchar(dd.zip$zip.c))


##add leading zeros to 4-character zips only
for (i in 1:len(dd.zip$zip.c)){
	if(nchar(dd.zip$zip.c[i])==4){dd.zip$zip.c[i]<-paste("0",dd.zip$zip.c[i],sep="")}
	}
	
dd.zip$zip<-dd.zip$zip.c

table(nchar(dd.zip$zip))

dd.zip$zip[grep("8370", dd.zip$zip)]

dim(dd)
dd<-merge(dd, dd.zip, by="zip", all.x =T, sort=F)
dim(dd)


dd$zip[grep("8370", dd$zip)]


##generate superzips variable
z.inc.q<-quantile(dd.zip$percapinc11, prob=c(0, 0.2,0.4,0.6,0.8,1), na.rm=T)
dd$percapinc11grp<-cut(dd$percapinc11, as.numeric(z.inc.q))
dd$topincgrp<-as.n(as.n(dd$percapinc11grp)==5)
dd$topedgrp<-as.numeric(dd$prop.college>quantile(dd.zip$prop.college,c(0.8), na.rm=T))
dd$superzip<-as.numeric(dd$topincgrp & dd$topedgrp)
table(dd$superzip)

##gen a richzip variable - just rich zips, no ed component
z.inc.q<-quantile(dd.zip$percapinc11, prob=c(0, 0.2,0.4,0.6,0.8,1), na.rm=T)
dd$percapinc11grp<-cut(dd$percapinc11, as.numeric(z.inc.q))
dd$richzip<-as.n(as.n(dd$percapinc11grp)==5)
table(dd$richzip)



dd.final<-dd
dim(dd)

table(dd$democrat)
table(dd$republican)
table(dd$democrat==0 & dd$republican==0)##no more independents


##save the data frame
save(dd.final, file="data/dd.final.Rdata")






#############################################
#############################################
#############################################
#############################################


##### ANES DATA


#############################################
#############################################
#############################################
#############################################


load("data/dd.final.Rdata")
dd<-dd.final
dim(dd)

as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length



## load stata version of ANES, has some incorrect labels
anes.2012.file<-paste(wd, "data/anes_timeseries_2012.dta", sep="")
anes.12<-read.dta(anes.2012.file)
dim(anes.12)
anes.12$surveyID<-"2012"


####ID 2012 variables of interest

##PARTY ID, 
##our data:

##generate DEMOCRAT DUMMY
table(dd$partywlean)
table(dd$democrat)

##REPUBLICAN DUMMY
table(dd$partywlean)
table(dd$republican)

##anes 2012 pre-election
##pid_self

anes.12$pid_self<-as.c(anes.12$pid_self)
table(anes.12$pid_self)##initial question
anes.12$pid_lean<-as.c(anes.12$pid_lean)
table(anes.12$pid_lean)##leaner question
anes.12$partywlean<-NA
anes.12$partywlean[anes.12$pid_self=="1. Democrat"]<-"dem"
anes.12$partywlean[anes.12$pid_self=="2. Republican"]<-"rep"
table(anes.12$partywlean)
##add the leaners
## rep leaners
anes.12$partywlean[anes.12$pid_lean=="1. Closer to Republican"]<-"rep"
##dem leaners
anes.12$partywlean[anes.12$pid_lean=="3. Closer to Democratic"]<-"dem"
table(anes.12$partywlean)##this tabulation has one additional Dem than the tabulation in the codebook; reason unknown

##DEMOCRAT DUMMY
anes.12$democrat<-NA
anes.12$democrat[anes.12$partywlean=="dem"]<-1
anes.12$democrat[anes.12$partywlean!="dem" & !is.na(anes.12$partywlean) ]<-0
table(anes.12$democrat)
#REPUBLICAN DUMMY
anes.12$republican<-NA
anes.12$republican[anes.12$partywlean=="rep"]<-1
anes.12$republican[anes.12$partywlean!="rep" & !is.na(anes.12$partywlean) ]<-0
table(anes.12$republican)


 ##WHITE
##our data:
table(dd$white2) 

##anes12
anes.12$dem_raceeth<-as.c(anes.12$dem_raceeth)
table( anes.12$dem_raceeth)

anes.12$white<-NA
anes.12$white[anes.12$dem_raceeth=="1. White non-Hispanic"]<-1
anes.12$white[anes.12$dem_raceeth!="1. White non-Hispanic" & anes.12$dem_raceeth!="-8. Don't know" & anes.12$dem_raceeth!="-9. Refused" & !is.na(anes.12$dem_raceeth)]<-0
table(anes.12$white)

##BLACK
##our data:
table(dd$black2)
##anes.12
anes.12$black<-NA
anes.12$black[anes.12$dem_raceeth=="2. Black non-Hispanic"]<-1
anes.12$black[anes.12$dem_raceeth!="2. Black non-Hispanic" & anes.12$dem_raceeth!="-8. Don't know" & anes.12$dem_raceeth!="-9. Refused"  & !is.na(anes.12$dem_raceeth)]<-0
table(anes.12$black)


##HISPANIC/LATINO (some of these will seldf ID as white)
#our data:
table(dd$hisp2)
##anes12
anes.12$hisp<-NA
anes.12$hisp[anes.12$dem_raceeth=="3. Hispanic"]<-1
anes.12$hisp[anes.12$dem_raceeth!="3. Hispanic" & anes.12$dem_raceeth!="-8. Don't know" & anes.12$dem_raceeth!="-9. Refused" &  !is.na(anes.12$dem_raceeth)]<-0
table(anes.12$hisp)
 

 ##our data: ours is age in exact years
 table(dd$age)
 ##anes 2012:
  ## will recode ANES data as age group midpoints (exact age in restricted ANES data only)
anes.12$dem_agegrp_iwdate<-as.c(anes.12$dem_agegrp_iwdate)
table(anes.12$dem_agegrp_iwdate)

anes.12$age<-NA
anes.12$age[anes.12$dem_agegrp_iwdate=="01. Age group 17-20"]<-(20+17)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="02. Age group 21-24"]<-(21+24)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="03. Age group 25-29"]<-(25+29)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="04. Age group 30-34"]<-(30+34)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="05. Age group 35-39"]<-(35+39)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="06. Age group 40-44"]<-(40+44)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="07. Age group 45-49"]<-(45+49)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="08. Age group 50-54"]<-(50+54)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="09. Age group 55-59"]<-(55+59)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="10. Age group 60-64"]<-(60+64)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="11. Age group 65-69"]<-(65+69)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="12. Age group 70-74"]<-(70+74)/2
anes.12$age[anes.12$dem_agegrp_iwdate=="13. Age group 75 or older"]<-(75+99)/2 ##arbitrary topcoding here of 99 years
table(anes.12$age )


##GENDER
##our data:
table(dd$female)
##anes 2012:
anes.12$gender_respondent<-as.c(anes.12$gender_respondent)
table(anes.12$gender_respondent)
anes.12$female<-NA
anes.12$female[anes.12$gender_respondent=="2. Female" ]<-1
anes.12$female[anes.12$gender_respondent!="2. Female" & !is.na ( anes.12$gender_respondent) ]<-0
table(anes.12$female)

##Ideology, pre-election
##our data: 1=con 2=mod 3=lib
table(dd$ideol)
##anes 2012
anes.12$libcpre_self<-as.c(anes.12$libcpre_self)
table(anes.12$libcpre_self)
anes.12$libcpre_choose<-as.c(anes.12$libcpre_choose)
table( anes.12$libcpre_choose)
anes.12$ideol<-NA
anes.12$ideol[anes.12$libcpre_self=="1. Extremely liberal"|anes.12$libcpre_self=="2. Liberal"| anes.12$libcpre_self=="3. Slightly liberal"| anes.12$libcpre_choose=="1. Liberal"]<-3
anes.12$ideol[anes.12$libcpre_self=="4. Moderate; middle of the road" & anes.12$libcpre_choose=="3. Moderate {VOL}"  ]<-2
anes.12$ideol[anes.12$libcpre_self=="5. Slightly conservative"|anes.12$libcpre_self=="5. Slightly conservative"|anes.12$libcpre_self=="6. Conservative"|anes.12$libcpre_self=="7. Extremely conservative"|anes.12$libcpre_choose=="2. Conservative"]<-1
table(anes.12$ideol)


##INTEREST IN POLITICS
##in our data: 1=no 2=neutral 3 = yes
table(dd$likepol)
##nes data:
##interest_attention "How often do you pay attention to what's going on in government and politics? [ALWAYS, MOST OF THE TIME, ABOUT HALF THE TIME, SOME OF THE TIME, or NEVER / NEVER, SOME OF THE TIME, ABOUT HALF THE TIME, MOST OF THE TIME, or ALWAYS]?....this is a 5-point scale, harder to sync with ours, but probably substantively closer in question wording
#
#anes.12$interest_attention<-as.c(anes.12$interest_attention)
#table(anes.12$interest_attention)

## interest_following "Some people don't pay much attention to political campaigns. How about you? Would you say that you have been [VERY MUCH inter- ested, SOMEWHAT interested or NOT MUCH interested/ NOT MUCH interested, SOMEWHAT interested or VERY MUCH interested] in the political campaigns so far this year?"...This is already a 3-point scale, but is year-specific

anes.12$interest_following<-as.c(anes.12$interest_following)
table(anes.12$interest_following)
anes.12$likepol<-NA
anes.12$likepol[ anes.12$interest_following=="1. Very much interested"]<-3
anes.12$likepol[ anes.12$interest_following=="2. Somewhat interested"]<-2
anes.12$likepol[ anes.12$interest_following=="3. Not much interested"]<-1
table(anes.12$likepol)

##did R vote?
##our data
table(dd$vote)

##nes 12:
anes.12$rvote2012_x<-as.c(anes.12$rvote2012_x)
table(anes.12$rvote2012_x )
anes.12$vote<-NA
anes.12$vote[ anes.12$rvote2012_x=="2. R did not vote in the 2012 elections"]<-0
anes.12$vote[ anes.12$rvote2012_x=="1. R voted in the 2012 elections"]<-1
table(anes.12$vote)

###EDUCATION
##our data:
table(dd$educ.years)

##nes.12, pre-election:
anes.12$dem_edugroup<-as.c(anes.12$dem_edugroup)
table(anes.12$dem_edugroup)
##recode into years
anes.12$educ.years<-NA
anes.12$educ.years[anes.12$dem_edugroup=="1. Less than high school credential"]<-8
anes.12$educ.years[anes.12$dem_edugroup=="2. High school credential"]<-12
anes.12$educ.years[anes.12$dem_edugroup=="3. Some post-high-school, no bachelor's degree"]<-13
anes.12$educ.years[anes.12$dem_edugroup=="4. Bachelor's degree"]<-16
anes.12$educ.years[anes.12$dem_edugroup=="5. Graduate degree"]<-18
table(anes.12$educ.years)

###Marital status
##our data
table(dd$married)
##nes12:, pre election
##recode as married or not
anes.12$dem_marital<-as.c(anes.12$dem_marital)
table(anes.12$dem_marital)
anes.12$married<-NA
anes.12$married[anes.12$dem_marital=="1. Married: spouse present"]<-1
anes.12$married[anes.12$dem_marital=="2. FTF ONLY: Married: spouse absent {VOL}"]<-1
anes.12$married[anes.12$dem_marital=="3. Widowed"| anes.12$dem_marital=="4. Divorced"| anes.12$dem_marital=="5. Separated"| anes.12$dem_marital=="6. Never married"]<-0
table(anes.12$married)

##OWN or RENT HOME?
##our data:
table(dd$ownrent2)
##nes12:
anes.12$dem3_ownhome<-as.c(anes.12$dem3_ownhome)
table(anes.12$dem3_ownhome )
anes.12$ownrent2<-NA
anes.12$ownrent2[anes.12$dem3_ownhome =="1. Own home"]<-1
anes.12$ownrent2[anes.12$dem3_ownhome =="2. Pay rent"]<-0
table(anes.12$ownrent2)


####RELIGIOUS ID (major categories).....restricted in ANES
anes.12$relig_groupna<-as.c(anes.12$relig_groupna)
table( anes.12$relig_groupna)

##INCOME - restricted in ANES

###CENSUS REGION 

##our data:
##add regional dummies to dd
##load zip code shape file
file.name<-paste(wd, "/data/zipjoincounties.dbf", sep="")
zip.county<-read.dbf(file.name)$dbf
head(zip.county)
##northeast dummy
##Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont, New Jersey, New York and Pennsylvania
zip.county$northeast<-0
zip.county$northeast[zip.county$STATE_ABBR=="CT"|zip.county$STATE_ABBR=="ME"|zip.county$STATE_ABBR=="MA"|zip.county$STATE_ABBR=="NH"|zip.county$STATE_ABBR=="RI"|zip.county$STATE_ABBR=="VT"|zip.county$STATE_ABBR=="NJ"|zip.county$STATE_ABBR=="NY"|zip.county$STATE_ABBR=="PA"]<-1
table(zip.county$northeast)

##midwest dummy
##Illinois, Indiana, Michigan, Ohio and Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota and South Dakota
zip.county$midwest<-0
zip.county$midwest[zip.county$STATE_ABBR=="IL"|zip.county$STATE_ABBR=="IN"|zip.county$STATE_ABBR=="MI"|zip.county$STATE_ABBR=="OH"|zip.county$STATE_ABBR=="WI"|zip.county$STATE_ABBR=="IA"|zip.county$STATE_ABBR=="KS"|zip.county$STATE_ABBR=="MN"|zip.county$STATE_ABBR=="MO"|zip.county$STATE_ABBR=="NE"|zip.county$STATE_ABBR=="ND"|zip.county$STATE_ABBR=="SD"]<-1
table(zip.county$midwest)

##south dummy
##Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia and West Virginia, Alabama, Kentucky, Mississippi and Tennessee, Arkansas, Louisiana, Oklahoma and Texas
zip.county$south<-0
zip.county$south[zip.county$STATE_ABBR=="DE"|zip.county$STATE_ABBR=="DC"|zip.county$STATE_ABBR=="FL"|zip.county$STATE_ABBR=="GA"|zip.county$STATE_ABBR=="MD"|zip.county$STATE_ABBR=="NC"|zip.county$STATE_ABBR=="SC"|zip.county$STATE_ABBR=="VA"|zip.county$STATE_ABBR=="WV"|zip.county$STATE_ABBR=="AL"|zip.county$STATE_ABBR=="KY"|zip.county$STATE_ABBR=="MS"|zip.county$STATE_ABBR=="TN"|zip.county$STATE_ABBR=="AR"|zip.county$STATE_ABBR=="LA"|zip.county$STATE_ABBR=="OK"|zip.county$STATE_ABBR=="TX"]<-1
table(zip.county$south)

##west dummy
##Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah and Wyoming, Alaska, California, Hawaii, Oregon and Washington
zip.county$west<-0
zip.county$west[zip.county$STATE_ABBR=="AZ"|zip.county$STATE_ABBR=="CO"|zip.county$STATE_ABBR=="ID"|zip.county$STATE_ABBR=="MT"|zip.county$STATE_ABBR=="NV"|zip.county$STATE_ABBR=="NM"|zip.county$STATE_ABBR=="UT"|zip.county$STATE_ABBR=="WY"|zip.county$STATE_ABBR=="AK"|zip.county$STATE_ABBR=="CA"|zip.county$STATE_ABBR=="HI"|zip.county$STATE_ABBR=="OR"|zip.county$STATE_ABBR=="WA"]<-1
table(zip.county$west)

##do they add to 1?
mean(zip.county$west, na.rm=TRUE) + mean(zip.county$south, na.rm=TRUE) + mean(zip.county$northeast, na.rm=TRUE) + mean(zip.county$midwest, na.rm=TRUE)

class(dd$zip)
class(zip.county$ZIP)
zip.county$zip<-as.c(zip.county$ZIP)
class(zip.county$zip)

###merge in regional dummy variables
dd<-merge(dd, zip.county, by="zip", all.x =T, sort=F)
dim(dd)




table(dd$northeast)
table(dd$west)
table(dd$south)
table(dd$midwest)


##check for mutual exclusivity
dd$region.check<-paste(as.c(dd$northeast), as.c(dd$midwest), as.c(dd$south), as.c(dd$west), sep="" )
table(dd$region.check)##there are 1367 NAs (bad zip codes in original data)
table(dd$region.check[dd$partywlean=="dem"|dd$partywlean=="rep"])##there are only 32 bad zip codes in the data we are using (the partisan data)



##replace bad zips that are known with correct region:
dd$west[dd$zip=="84081"|dd$zip=="84083"|dd$zip=="85142"|dd$zip=="84129"|dd$zip=="85083"|dd$zip=="85119"|dd$zip=="85122"|dd$zip=="85123"|dd$zip=="85139"|dd$zip=="85140"|dd$zip=="85295"|dd$zip=="85392"]<-1
dd$south[dd$zip=="84081"|dd$zip=="84083"|dd$zip=="85142"|dd$zip=="84129"|dd$zip=="85083"|dd$zip=="85119"|dd$zip=="85122"|dd$zip=="85123"|dd$zip=="85139"|dd$zip=="85140"|dd$zip=="85295"|dd$zip=="85392"]<-0
dd$midwest[dd$zip=="84081"|dd$zip=="84083"|dd$zip=="85142"|dd$zip=="84129"|dd$zip=="85083"|dd$zip=="85119"|dd$zip=="85122"|dd$zip=="85123"|dd$zip=="85139"|dd$zip=="85140"|dd$zip=="85295"|dd$zip=="85392"]<-0
dd$northeast[dd$zip=="84081"|dd$zip=="84083"|dd$zip=="85142"|dd$zip=="84129"|dd$zip=="85083"|dd$zip=="85119"|dd$zip=="85122"|dd$zip=="85123"|dd$zip=="85139"|dd$zip=="85140"|dd$zip=="85295"|dd$zip=="85392"]<-0

dd$midwest[dd$zip=="60642"|dd$zip=="49037"]<-1
dd$northeast[dd$zip=="60642"|dd$zip=="49037"]<-0
dd$west[dd$zip=="60642"|dd$zip=="49037"]<-0
dd$south[dd$zip=="60642"|dd$zip=="49037"]<-0

dd$northeast[dd$zip=="10065"]<-1
dd$south[dd$zip=="10065"]<-0
dd$midwest[dd$zip=="10065"]<-0
dd$west[dd$zip=="10065"]<-0

dd$south[dd$zip=="29707"|dd$zip=="33449"|dd$zip=="33545"|dd$zip=="33579"|dd$zip=="33974"|dd$zip=="73025"|dd$zip=="77407"|dd$zip=="7766/"|dd$zip=="78665"|dd$zip=="78633"]<-1
dd$northeast[dd$zip=="29707"|dd$zip=="33449"|dd$zip=="33545"|dd$zip=="33579"|dd$zip=="33974"|dd$zip=="73025"|dd$zip=="77407"|dd$zip=="7766/"|dd$zip=="78665"|dd$zip=="78633"]<-0
dd$west[dd$zip=="29707"|dd$zip=="33449"|dd$zip=="33545"|dd$zip=="33579"|dd$zip=="33974"|dd$zip=="73025"|dd$zip=="77407"|dd$zip=="7766/"|dd$zip=="78665"|dd$zip=="78633"]<-0
dd$midwest[dd$zip=="29707"|dd$zip=="33449"|dd$zip=="33545"|dd$zip=="33579"|dd$zip=="33974"|dd$zip=="73025"|dd$zip=="77407"|dd$zip=="7766/"|dd$zip=="78665"|dd$zip=="78633"]<-0

##note: "7766/" recoded to "south" because it uses a Texas prefix

##do they add to 1? Yes, now they do.
mean(dd$west, na.rm=TRUE) + mean(dd$south, na.rm=TRUE) + mean(dd$northeast, na.rm=TRUE) + mean(dd$midwest, na.rm=TRUE)

##re-tabulate the regions
dd$region.check<-paste(as.c(dd$northeast), as.c(dd$midwest), as.c(dd$south), as.c(dd$west), sep="" )
table(dd$region.check)##there are 1335 NAs (bad zip codes in original data)
table(dd$region.check[dd$partywlean=="dem"|dd$partywlean=="rep"])
##bad zips in the data we are using:
dd$zip[dd$region.check=="NANANANA"&dd$partywlean=="dem"]
dd$zip[dd$region.check=="NANANANA"&dd$partywlean=="rep"]
##there are only 2 bad zip codes in the data we are using (the partisan data). These are " 8370" and "00000"...These will be excluded from regional tests.


##anes 2012:
##regions based on this: http://www.census.gov/econ/census07/www/geography/regions_and_divisions.html

anes.12$sample_state<-as.c(anes.12$sample_state)
table(anes.12$sample_state, exclude=F)

##northeast dummy
##Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont, New Jersey, New York and Pennsylvania
anes.12$northeast<-0
anes.12$northeast[anes.12$sample_state=="MA"|anes.12$sample_state=="CT"|anes.12$sample_state=="ME"|anes.12$sample_state=="NH"|anes.12$sample_state=="RI"|anes.12$sample_state=="VT"|anes.12$sample_state=="NJ"|anes.12$sample_state=="NY"|anes.12$sample_state=="PA"]<-1
table(anes.12$northeast)

##midwest dummy
##Illinois, Indiana, Michigan, Ohio and Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota and South Dakota
anes.12$midwest<-0
anes.12$midwest[anes.12$sample_state=="IL"|anes.12$sample_state=="IN"|anes.12$sample_state=="MI"|anes.12$sample_state=="OH"|anes.12$sample_state=="WI"|anes.12$sample_state=="IA"|anes.12$sample_state=="KS"|anes.12$sample_state=="MN"|anes.12$sample_state=="MO"|anes.12$sample_state=="NE"|anes.12$sample_state=="ND"|anes.12$sample_state=="SD"]<-1
table(anes.12$midwest)

##south dummy
##Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia and West Virginia, Alabama, Kentucky, Mississippi and Tennessee, Arkansas, Louisiana, Oklahoma and Texas
anes.12$south<-0
anes.12$south[anes.12$sample_state=="DE"|anes.12$sample_state=="DC"|anes.12$sample_state=="FL"|anes.12$sample_state=="GA"|anes.12$sample_state=="MD"|anes.12$sample_state=="NC"|anes.12$sample_state=="SC"|anes.12$sample_state=="VA"|anes.12$sample_state=="WV"|anes.12$sample_state=="AL"|anes.12$sample_state=="KY"|anes.12$sample_state=="MS"|anes.12$sample_state=="TN"|anes.12$sample_state=="AR"|anes.12$sample_state=="LA"|anes.12$sample_state=="OK"|anes.12$sample_state=="TX"]<-1
table(anes.12$south)

##west dummy
##Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah and Wyoming, Alaska, California, Hawaii, Oregon and Washington
anes.12$west<-0
anes.12$west[anes.12$sample_state=="AZ"|anes.12$sample_state=="CO"|anes.12$sample_state=="ID"|anes.12$sample_state=="MT"|anes.12$sample_state=="NV"|anes.12$sample_state=="NM"|anes.12$sample_state=="UT"|anes.12$sample_state=="WY"|anes.12$sample_state=="AK"|anes.12$sample_state=="CA"|anes.12$sample_state=="HI"|anes.12$sample_state=="OR"|anes.12$sample_state=="WA"]<-1
table(anes.12$west)

table(paste(as.c(anes.12$northeast), as.c(anes.12$midwest), as.c(anes.12$south), as.c(anes.12$west), sep="" ))##looks good

##do they add to 1? Yes, they do.
mean(anes.12$west, na.rm=TRUE) + mean(anes.12$south, na.rm=TRUE) + mean(anes.12$northeast, na.rm=TRUE) + mean(anes.12$midwest, na.rm=TRUE)

dd.final<-dd
save(dd.final, file="data/dd.final.Rdata")
save(anes.12, file="data/anes.12.regions.Rdata")







#############################################
#############################################
#############################################
#############################################


##### BUILD ESTIMATION DATA FOR HISTOGRAMS SHOWING FEASABILITY OF SORTING


#############################################
#############################################
#############################################
#############################################


rm(list=ls()[ls()!="wd"])



as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length





as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length








####CODE UP CCES
###2012 CCES
load("data/commoncontent2012.Rdata")###load the 2012 CCES

dd<-x
dim(dd)
names(dd)

##party ID
table(dd$pid3, exclude=NULL)
table(dd$pid7others)


dd$dem<-0
dd$dem[dd$pid3=="Democrat"]<-1
dd$dem[dd$pid7others=="Lean Democrat"]<-1
table(dd$dem)


dd$rep<-0
dd$rep[dd$pid3=="Republican"]<-1
dd$rep[dd$pid7others=="Lean Republican"]<-1
table(dd$rep)

##gender
table(dd$gender, exclude=NULL)
dd$female<-NA
dd$female[dd$gender=="Male"]<-0
dd$female[dd$gender=="Female"]<-1
table(dd$female)

##age
table(dd$birthyr, exclude=NULL)
class(dd$birthyr)
dd$birthyr2<-dd$birthyr
dd$birthyr3<-as.n(as.c(dd$birthyr2))
sum(table(dd$birthyr3))
table(dd$birthyr3)
sum(table(dd$birthyr3))
class(dd$birthyr3)

dd$age<-2012-dd$birthyr3
table(dd$age)
mean(dd$age)

dd.dem<-subset(dd, dd$dem==1)
dd.rep<-subset(dd, dd$rep==1)

dem.age<-summary(lm(age~1, data=dd.dem, weights=dd.dem$V103))$coefficients[1]##[1] 45.29492
rep.age<-summary(lm(age~1, data=dd.rep, weights=dd.rep$V103))$coefficients[1]##[1] 49.33437

##18-29
dd$age1829<-NA
dd$age1829[dd$birthyr3<1983]<-0
dd$age1829[dd$birthyr3>=1983 & dd$birthyr3<=1994]<-1
table(dd$age1829)

dd$age3044<-NA
dd$age3044[dd$birthyr3<1968|dd$birthyr3>1982]<-0
dd$age3044[dd$birthyr3>=1968 & dd$birthyr3<=1982]<-1
table(dd$age3044)

dd$age4564<-0
dd$age4564[dd$birthyr3<1948|dd$birthyr3>1967]<-0
dd$age4564[dd$birthyr3>=1948 & dd$birthyr3<=1967]<-1
table(dd$age4564)

dd$age65plus<-0
dd$age65plus[dd$birthyr3<=1947]<-1
table(dd$age65plus)

mean(dd$age1829)+mean(dd$age3044)+mean(dd$age4564)+mean(dd$age65plus)

##age over 55 (social sec IAP)
dd$over55<-0
dd$over55[dd$age>55]<-1
table(dd$over55)

##race
##nhwhite
table(dd$race, exclude=NULL)
sum(table(dd$race))
table(dd$hispanic)
dd$nhwhite<-0##this is correct coding of 0
dd$nhwhite[dd$race=="White" & dd$race!="Hispanic" & dd$hispanic!="Yes"]<-1
table(dd$nhwhite)

##nhblack
dd$nhblack<-0##this is correct coding of 0
dd$nhblack[dd$race=="Black" & dd$race!="Hispanic" & dd$hispanic!="Yes"]<-1
table(dd$nhblack)

##hispanic/latino
dd$hisp<-0
dd$hisp[dd$hispanic=="Yes"]<-1
dd$hisp[dd$race=="Hispanic"]<-1
table(dd$hisp)

##other race
dd$other<-0
dd$other[dd$nhwhite!=1 & dd$nhblack!=1 & dd$hisp!=1]<-1
table(dd$other)##this is most restrictive way to do it (as opposed to adding each other category individually) because some hispanics get included that way

###Education
table(dd$educ)
dd$educ2<-as.c(dd$educ)
table(dd$educ2)
dd$BA<-0
dd$BA[dd$educ2=="4-year"|dd$educ2=="Post-grad"]<-1
table(dd$BA)
weighted.mean(dd$BA, w=)


###Income
table(dd$faminc, exclude=NULL)
dd$faminc2<-as.c(dd$faminc)
table(dd$faminc2)

dd$faminc3<-NA
dd$faminc3[dd$faminc2=="Less than $10,000"]<-5
dd$faminc3[dd$faminc2=="$10,000 - $19,999"]<-15
dd$faminc3[dd$faminc2=="$20,000 - $29,999"]<-25
dd$faminc3[dd$faminc2=="$30,000 - $39,999"]<-35
dd$faminc3[dd$faminc2=="$40,000 - $49,999"]<-45
dd$faminc3[dd$faminc2=="$50,000 - $59,999"]<-55
dd$faminc3[dd$faminc2=="$60,000 - $69,999"]<-65
dd$faminc3[dd$faminc2=="$70,000 - $79,999"]<-75
dd$faminc3[dd$faminc2=="$80,000 - $99,999"]<-90
dd$faminc3[dd$faminc2=="$100,000 - $119,999"]<-110
dd$faminc3[dd$faminc2=="$120,000 - $149,999"]<-130
dd$faminc3[dd$faminc2=="$150,000 - $199,999"]<-175
dd$faminc3[dd$faminc2=="$200,000 - $249,999"]<-225
dd$faminc3[dd$faminc2=="$250,000 - $349,999"]<-300
dd$faminc3[dd$faminc2=="$350,000 - $499,999"]<-425
dd$faminc3[dd$faminc2=="$500,000 or more"]<-500
table(dd$faminc3)

##housing cap
dd$faminc.house<-dd$faminc3*3*1000
table(dd$faminc.house)


##rental cap
dd$faminc.rent<-((dd$faminc3*1000)/12)*.3
table(dd$faminc.rent)


cces.income<-cbind.data.frame(dem=dd$dem, rep=dd$rep, faminc3=dd$faminc3, faminc.house=dd$faminc.house, famincrent=dd$faminc.rent, weight=dd$V103, zipnew=dd$lookupzip)
head(cces.income)
dim(cces.income)

save(cces.income, file="data/cces_2012_income.Rdata")







######LOAD IN ZCTA CENSUS DATA

zip<-read.csv("data/zipcode.2010.withpctr08andmsa.csv", header=TRUE)
head(zip)
dim(zip)



length(na.omit(zip$pctr08))

zip$msa<-as.character(zip$msa)


table(nchar(zip$zip))
##generate function to trim leading and trailing white space from strings
trim <- function (x) gsub("^\\s+|\\s+$", "", x)


zip$zipnew<-trim(as.character(zip$zip))

##make sure trailing zeros are present in the data
zip$zipnew[substr(zip$zipnew, start=nchar(zip$zipnew), stop=nchar(zip$zipnew))==0]
##they are present


##fill in leading zeros for zip codes so they are all five digits
for(i in 1:length(zip[,1])){

if(nchar(zip$zipnew[i])==3)
{zip$zipnew[i]<-paste("00",zip$zipnew[i],sep="")  } 


if(nchar(zip$zipnew[i])==4)
{zip$zipnew[i]<-paste("0",zip$zipnew[i],sep="")  } 
}
table(nchar(zip$zipnew)) ##now they are all 5 characters long


load("data/spatial_lag_groups.Rdata") ###load in file that contains neighboring zip codes for each zip code. object is called "nab" and is a list.
head(nab)

###make vector of desired census variables
census<-c("aggval.ownocc","units.ownocc","units.pctownocc","pop.zip.5yr.2012","pct.over65.5yr.2012","pctwht.5yr.2012","pctblkonly.5yr.2012","pctba.5yr.2012","pctr08","ownocc.val.q25","ownocc.val.med","ownocc.val.q75","ownoccunits","totunits","mediangrossrent","pctowned.5yr.2012","popdens.5yr.2012","dens10kup","dens5kup","pctowngreater70")

census2<-c("aggval.ownocc2","units.ownocc2","units.pctownocc2","pop.zip.5yr.20122","pct.over65.5yr.20122","pctwht.5yr.20122","pctblkonly.5yr.20122","pctba.5yr.20122","pctr082","ownocc.val.q252","ownocc.val.med2","ownocc.val.q752","ownoccunits2","totunits2","mediangrossrent2","pctowned.5yr.20122","popdens.5yr.20122","dens10kup2","dens5kup2","pctowngreater702")

##recode and add new vars to data frame
for(i in 1:length(census)){
	zip[,census[i]]<-as.numeric(zip[,census[i]])
	zip[,census2[i]]<-zip[,census[i]]
}







##DATA IMPUTATION: fill in the means calculated from the spatial lags for all census variables
for(k in 1:length(census2)  ){ ##for each variable in the census 2 vector
means<-NA ##make an empty vector to store results
for(j in which (is.na(zip[, census2[k] ] ))   ){ ###for each missing row of data in this variable's column

its<-which(is.na(zip[, census2[k] ] )) ##get vector of missing row indices

means[which(its==j)]<-mean(zip[zip$zipnew%in%nab[[zip$zipnew[j]]], census2[k]], na.rm=TRUE) ##compute the mean of this variable in all neighboring zip codes to the zip code in row j and store in the appropriate element of the vector means
		  
}

zip[  its , census2[k]] <-means ##deposit means in appropriate rows

##move on to next variable and repeat
}

save(zip, file="data/zip_apsa_imputed11.Rdata")



##spot check

## row 30843

check<-zip$zipnew[30843] ##get zip code in this row
mean(zip[zip$zipnew%in%nab[[check]],"ownocc.val.med2"], na.rm=T) == zip[zip$zipnew==check,"ownocc.val.med2"] ##should be true

##for an observation that came out missing
check<-zip$zipnew[27648] ##get zip code in this row
zip[zip$zipnew%in%nab[[check]],"pctowngreater702"]
mean(zip[zip$zipnew%in%nab[[check]],"pctowngreater702"], na.rm=T) == zip[zip$zipnew==check,"ownocc.val.med2"] ##should be NA


##take a look at summaries of these new variables
for(i in 1:length(census2)){
	
	
	print(summary(zip[,census2[i]]))
	
}


##########




##code up variables for affordability analysis based on rule of thumb here http://money.cnn.com/magazines/moneymag/money101/lesson8/  they say 2.5 times gross annual income and we are being more lenient by using 3x income
rm(list=ls()[ls()!="wd"])

load("data/zip_apsa_imputed11.Rdata")
head(zip)



 load("data/cces_2012_income.Rdata") 
 head(cces.income)

 table(cces.income$faminc.house)
 summary(zip$ownocc.val.med2)
 summary(zip$ownocc.val.q252)
 
 zip$dem.afford.house25<-NA
 zip$rep.afford.house25<-NA
 
 
 ##code each zip code in the census data with the share of CCES partisans that could afford the 25th percentile owner occupied house in that zip code based on the 3x rule of thumb
  
 for(i in 1:nrow(zip)){
 	if(!is.na(zip$ownocc.val.q252[i])){
 cces.income$cutoff<-NA
 cces.income$cutoff[cces.income$faminc.house  <zip$ownocc.val.q252[i] & cces.income$dem==1]<-0
 cces.income$cutoff[cces.income$faminc.house  >=zip$ownocc.val.q252[i] & cces.income$dem==1]<-1
 zip$dem.afford.house25[i]<- weighted.mean(cces.income$cutoff, w=cces.income$weight, na.rm=TRUE)##use population weights in computation

 
 
 cces.income$cutoff<-NA
 cces.income$cutoff[cces.income$faminc.house  <zip$ownocc.val.q252[i] & cces.income$rep==1]<-0
 cces.income$cutoff[cces.income$faminc.house  >=zip$ownocc.val.q252[i] & cces.income$rep==1]<-1
 zip$rep.afford.house25[i]<- weighted.mean(cces.income$cutoff, w=cces.income$weight, na.rm=TRUE)##use population weights in computation

 
 }
 }

save(zip, file="data/zip_apsa_imputed22.Rdata")


#save(zip, file="data/zip_apsa_imputed_final2.Rdata") ##this is a different version than in APSA 2014. Excludes unnecessary median calculations above.





###### CLEAN DATA / MAKE VARIABLES
load("data/zip_apsa_imputed22.Rdata")

### add states and regions:

state.dd<-read.csv("data/2010_zcta5_to_county_state.csv", header=TRUE, as.is=TRUE)
head(state.dd)
state.dd$zipnew<-as.character(state.dd$ZCTA5)

# # ##fill in leading zeros

table(nchar(state.dd$zipnew))



 for(i in 1:length(state.dd[,1])){

 if(nchar(state.dd$zipnew[i])==3)
 {state.dd$zipnew[i]<-paste("00",state.dd$zipnew[i],sep="")  } 


 if(nchar(state.dd$zipnew[i])==4)
 {state.dd$zipnew[i]<-paste("0",state.dd$zipnew[i],sep="")  } 
 }
table(nchar(state.dd$zipnew))

##eliminate duplicate zips

state.dd$dup<-duplicated(state.dd$zipnew)
table(state.dd$dup)
dim(state.dd)
state.dd<-state.dd[state.dd$dup==FALSE,]
dim(state.dd)


state.fips<-read.csv("data/state_fips_codes.csv", header=TRUE, as.is=TRUE)
head(state.fips)
state.fips$STATE<-state.fips$FIPS.State.Numeric.Code

state.dta<-merge(state.dd, state.fips, by="STATE", all.x=TRUE)
head(state.dta)
dim(state.dta)

state.dta$STATE_ABBR<-state.dta$Official.USPS.Code

##restricted in ANES

###CENSUS REGION 

##load zip code shape file
##northeast dummy
##Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont, New Jersey, New York and Pennsylvania
state.dta$northeast<-0
state.dta$northeast[state.dta$STATE_ABBR=="CT"|state.dta$STATE_ABBR=="ME"|state.dta$STATE_ABBR=="MA"|state.dta$STATE_ABBR=="NH"|state.dta$STATE_ABBR=="RI"|state.dta$STATE_ABBR=="VT"|state.dta$STATE_ABBR=="NJ"|state.dta$STATE_ABBR=="NY"|state.dta$STATE_ABBR=="PA"]<-1
table(state.dta$northeast)

##midwest dummy
##Illinois, Indiana, Michigan, Ohio and Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota and South Dakota
state.dta$midwest<-0
state.dta$midwest[state.dta$STATE_ABBR=="IL"|state.dta$STATE_ABBR=="IN"|state.dta$STATE_ABBR=="MI"|state.dta$STATE_ABBR=="OH"|state.dta$STATE_ABBR=="WI"|state.dta$STATE_ABBR=="IA"|state.dta$STATE_ABBR=="KS"|state.dta$STATE_ABBR=="MN"|state.dta$STATE_ABBR=="MO"|state.dta$STATE_ABBR=="NE"|state.dta$STATE_ABBR=="ND"|state.dta$STATE_ABBR=="SD"]<-1
table(state.dta$midwest)

##south dummy
##Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia and West Virginia, Alabama, Kentucky, Mississippi and Tennessee, Arkansas, Louisiana, Oklahoma and Texas
state.dta$south<-0
state.dta$south[state.dta$STATE_ABBR=="DE"|state.dta$STATE_ABBR=="DC"|state.dta$STATE_ABBR=="FL"|state.dta$STATE_ABBR=="GA"|state.dta$STATE_ABBR=="MD"|state.dta$STATE_ABBR=="NC"|state.dta$STATE_ABBR=="SC"|state.dta$STATE_ABBR=="VA"|state.dta$STATE_ABBR=="WV"|state.dta$STATE_ABBR=="AL"|state.dta$STATE_ABBR=="KY"|state.dta$STATE_ABBR=="MS"|state.dta$STATE_ABBR=="TN"|state.dta$STATE_ABBR=="AR"|state.dta$STATE_ABBR=="LA"|state.dta$STATE_ABBR=="OK"|state.dta$STATE_ABBR=="TX"]<-1
table(state.dta$south)

##west dummy
##Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah and Wyoming, Alaska, California, Hawaii, Oregon and Washington
state.dta$west<-0
state.dta$west[state.dta$STATE_ABBR=="AZ"|state.dta$STATE_ABBR=="CO"|state.dta$STATE_ABBR=="ID"|state.dta$STATE_ABBR=="MT"|state.dta$STATE_ABBR=="NV"|state.dta$STATE_ABBR=="NM"|state.dta$STATE_ABBR=="UT"|state.dta$STATE_ABBR=="WY"|state.dta$STATE_ABBR=="AK"|state.dta$STATE_ABBR=="CA"|state.dta$STATE_ABBR=="HI"|state.dta$STATE_ABBR=="OR"|state.dta$STATE_ABBR=="WA"]<-1
table(state.dta$west)

##do they add to 1?
mean(state.dta$west, na.rm=TRUE) + mean(state.dta$south, na.rm=TRUE) + mean(state.dta$northeast, na.rm=TRUE) + mean(state.dta$midwest, na.rm=TRUE)

head(state.dta)

regions<-state.dta[,c("zipnew", "STATE","STATE_ABBR","northeast","midwest","south","west")]
head(regions)
dim(regions)

table(nchar(regions$zipnew))

save(regions,file="data/regions.Rdata")


##main data set

ls()
length(na.omit(zip$pctr082))

#load("~/Dropbox/geogaffect/data/gis/zcta/zip_apsa_imputed_final.Rdata")
table(nchar(zip$zipnew))
ls()

##how many unique zip codes?
length(zip$zipnew)
#census says total with puerto rico is 33,178

dim(zip)
zip<-merge(zip, regions, by="zipnew",all.x=TRUE)
dim(zip)
head(zip)
table(zip$northeast, exclude=NULL)
table(is.na(zip$northeast))
zip$miss.new<-is.na(zip$northeast)
table(zip$miss.new)
zip$zipnew[zip$miss.new==TRUE]
zip<-zip[zip$miss.new==FALSE, ] ##remove Virgin Islands, American Samoa, territories (24 observations)
table(zip$STATE_ABBR, exclude=NULL)
zip$miss.new<-is.na(zip$STATE_ABBR)
zip$zipnew[zip$miss.new==TRUE]##puerto rico
zip<-zip[zip$miss.new==FALSE, ] ##remove Virgin Islands, American Samoa, territories (24 observations)
table(zip$STATE_ABBR, exclude=NULL)


dim(zip)

table(nchar(zip$zipnew))

save(zip, file="data/zip_apsa_imputed22.Rdata")


####missing data analysis for top 50 MSAs

# # total<-NA
trim<-NA
topmsa<-NA
total<-NA

zip$msa<-as.character(zip$msa)

for(i in 1:length(unique(zip$msa) )  ){
	
total[i]<-length(zip$pctr082[zip$msa==unique(zip$msa)[i] &!is.na(zip$msa) & !is.na(zip$zipnew) ])	
trim[i]<-length(zip$pctr082[!is.na(zip$pctr082) & zip$msa==unique(zip$msa)[i] &!is.na(zip$msa) & !is.na(zip$zipnew)  ])	
topmsa[i]<-unique(zip$msa)[i]

}

names(total)<-names(trim)<-topmsa

missing<-cbind.data.frame( msa=topmsa, missing1=1-round(as.numeric(trim/total), digits=2) ) [-22,]

missing[order(missing$missing1),]

miss.msas<-missing$msa[missing$missing1>.45]

dim(zip)
zip<-zip[!(zip$msa%in%miss.msas),] ##drop msas with serious missing data issues
dim(zip)


miss2<-missing[missing$missing1<=.1,]
miss2$msa<-as.character(miss2$msa)


#write.csv(missing, file="~/Dropbox/geogaffect/data/gis/missing_data_msas.csv")


trim/total


# ##indicator for major msa with little missing data
zip$nomiss<-0
zip$nomiss[zip$msa%in% miss2$msa]<-1
table(zip$nomiss)
table(zip$msa[zip$nomiss==1])




##identify major msas
zip$pop<-as.numeric(zip$pop.zip.5yr.20122)
zip$msa<-as.character(zip$msa)
zip$sumpop<-NA

msa.sums<-  tapply(zip$pop, zip$msa, sum)
as.numeric(tapply(zip$pop, zip$msa, sum)["New York-Newark-Jersey City, NY-NJ-PA"])

for(i in 1:length(unique(zip$msa) )){
	
	zip$sumpop[zip$msa==unique(zip$msa)[i]]<-as.numeric(msa.sums[unique(zip$msa)[i]])
	
}




table(zip$sumpop[zip$msa=="New York-Newark-Jersey City, NY-NJ-PA"])
sum(zip$pop[zip$msa=="New York-Newark-Jersey City, NY-NJ-PA"], na.rm=TRUE) ##this worked



##identify top 50 msas
zip<-zip[order(zip$sumpop, decreasing=TRUE), ]
zip$rank<-rank(zip$sumpop)
zip$top50<-0
zip$top50[zip$rank%in%unique(zip$rank)[1:50]]<-1
table(zip$msa[zip$top50==1]) ##these are the 50 largest msas by population



zip$top25<-0
zip$top25[zip$rank%in%unique(zip$rank)[1:25]]<-1
table(zip$msa[zip$top25==1]) ##these are the 50 largest msas by population
table(zip$top25)


## Percent McCain in 2008
class(zip$pctr082)
summary(zip$pctr082)

##code bins for mccain vote share
##20 bins
zip$q20<-as.numeric(cut(zip$pctr082, breaks=20))
table(zip$q20)


##total housing units
class(zip$totunits2)
zip$housing<-as.numeric(zip$totunits2)
summary(zip$totunits2)
summary(zip$totunits)

summary(zip$housing)
all.units<-sum(zip$housing, na.rm=TRUE)
all.units

##calculate share of national housing unit in each zip
zip$unit.weight<-zip$housing/all.units
summary(zip$unit.weight)

##calculate sum of housing units within each MSA (this is the denominator for each MSA)
zip$sum.msa.house<-NA

msa.sum.house<-  tapply(zip$housing, zip$msa, sum)
as.numeric(tapply(zip$housing, zip$msa, sum)["New York-Newark-Jersey City, NY-NJ-PA"])
sum(zip$housing[zip$msa=="New York-Newark-Jersey City, NY-NJ-PA"], na.rm=TRUE)


for(i in 1:length(unique(zip$msa) )){
	
	zip$sum.msa.house[zip$msa==unique(zip$msa)[i]]<-as.numeric(msa.sum.house[unique(zip$msa)[i]])
	
}

table(zip$sum.msa.house[zip$msa=="New York-Newark-Jersey City, NY-NJ-PA"])##worked

##calculate share of each msa's housing contained in each zip
zip$share<-zip$housing/zip$sum.msa.house
sum(zip$share[zip$msa=="New York-Newark-Jersey City, NY-NJ-PA"], na.rm=TRUE) ##should equal 1



##education
zip$ba<-as.numeric(zip$pctba.5yr.20122)

##density
table(zip$popdens.5yr.20122)
zip$dens5k<-NA
zip$dens5k[zip$popdens.5yr.20122<=5000]<-0
zip$dens5k[zip$popdens.5yr.20122>5000]<-1
#table(zip$dens5k)

dim(zip)
table(nchar(zip$zipnew))


##missing data
dim(zip)
zip<-zip[!is.na(zip$pctr082) & !is.na(zip$zipnew) & !is.na(zip$ba) &  !is.na(zip$ownocc.val.q252) & !is.na(zip$popdens.5yr.20122 ) & !is.na(zip$pctowned.5yr.20122) &  !is.na(zip$top25) & !is.na(zip$south) & !is.na(zip$northeast) & !is.na(zip$west) & !is.na(zip$midwest), ]
dim(zip)


##within state pop and housing totals

pop.state<-NA
housing.state<-NA
state<-NA
length.state<-NA
length.state.all<-NA


for(i in 1:length(unique(zip$STATE_ABBR))){
	
	pop.state[i]<-sum(  zip$pop.zip.5yr.20122[zip$STATE_ABBR==unique(zip$STATE_ABBR)[i] ]  , na.rm=TRUE)
	housing.state[i]<-sum(zip$housing[zip$STATE_ABBR==unique(zip$STATE_ABBR)[i]], na.rm=TRUE)
   length.state[i]<-length(zip$zipnew[zip$STATE_ABBR==unique(zip$STATE_ABBR)[i]])
   length.state.all[i]<-length(regions$zipnew[regions$STATE_ABBR==unique(zip$STATE_ABBR)[i]])
	state[i]<-unique(zip$STATE_ABBR)[i]
	
}

miss.compare<-cbind.data.frame(state=state, pop=pop.state, housing=housing.state, N.ours=length.state, N.all=length.state.all)
miss.compare$pct.miss<-1 - (miss.compare$N.ours / miss.compare$N.all)
miss.compare<-miss.compare[order(miss.compare$pct.miss, decreasing=TRUE), ]
miss.compare

##percent of zip codes
## total ZCTA's is 33,178 (including puerto rico, source: https://www.census.gov/geo/reference/zctafaq.html)
sum(miss.compare$N.ours) ##report this

##total number of people in U.S. 
sum(miss.compare$pop)
sum(zip$pop.zip.5yr.20122, na.rm=TRUE)
sum(miss.compare$housing)
sum(zip$housing)

##true number in 2012 census

save(zip, file="data/zip_apsa_imputed22.Rdata")




####### ADD VARIABLE FOR INTER/INTRA MSA MOVE
# # ## re-load ssi data
load("data/dd.final.Rdata")

load("data/zip_apsa_imputed22.Rdata")
##merge in current and former msa for movers
msas.old<-cbind.data.frame(msa.old=zip$msa, zipnew=zip$zipnew)##using complete zip code data file
msas.new<-cbind.data.frame(msa.new=zip$msa, zipnew=zip$zipnew)



dd.final$zipnew<-dd.final$zip
dd.final$zipnew[dd.final$zipnew==""]<-NA
table(nchar(dd.final$zipnew))


table(nchar(dd.final$lastzip.text))##previous zip code
table(nchar(dd.final$zipnew))##current zip code
table(nchar(zip$zipnew))
names(zip)

head(dd.final)


dim(dd.final)
dd.final<-merge(dd.final, msas.new, by="zipnew", all.x=T)##merge by current zip code
dd.final<-merge(dd.final, msas.old, by.x="lastzip.text",by.y="zipnew", all.x=T)##merge by current zip code
dim(dd.final)
head(dd.final)

##what percentage of movers stayed in the same MSA
table(!is.na(dd.final$lastzip.text))
dd.final$same.msa<-NA
dd.final$same.msa[!is.na(dd.final$lastzip.text) & dd.final$msa.old==dd.final$msa.new]<-1
dd.final$same.msa[!is.na(dd.final$lastzip.text) & dd.final$msa.old!=dd.final$msa.new]<-0
table(is.na(dd.final$lastzip.text))
table(dd.final$same.msa, exclude=NULL)
mean(dd.final$same.msa, na.rm=T)



##re-save final data
save(dd.final, file="data/dd.final.Rdata")

























#############################################
#############################################
#############################################
#############################################


##### BUILD ESTIMATION DATA FOR CONJOINT ANALYSIS


#############################################
#############################################
#############################################
#############################################


rm(list=ls()[ls()!="wd"])
as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length


load("data/dd.final.Rdata")
dd<-dd.final
dim(dd)





##Organize randomized data

qstem<-"Q3"
stimvars<-grep(paste(qstem, ".L.F", sep=""), names(dd))
labvars<-grep(paste(qstem, ".F.F", sep=""), names(dd))
stim<-dd[,stimvars]
trtlvlnumbs<-sapply(strsplit(names(stim), "_"), function(x) x[2])##extract the treatment number from the name of each .L.F. variable
rands<-dd[,grep(paste(qstem, ".F.F", sep=""), names(dd))] 
trtindx<-as.n(sapply(strsplit(trtlvlnumbs, "\\."), function(x) x[1]))##extract item number from .L.F questions
lvlindx<-as.n(sapply(strsplit(trtlvlnumbs, "\\."), function(x) x[2])) ##extract item number from .F.F. questions
unrand<-apply(rands, 1, order)##obtain alphabetical order of 7 sets of items for each respondent
newstim<-stim
  
##newstim will be a matrix of the unrandomized responses  
  for(i in 1:nrow(stim)){
    for(j in 1:max(trtindx)){
    newstim[i,grep(paste(qstem, ".L.F_", j, sep=""), names(newstim))]<-stim[i,grep(paste(qstem, ".L.F_", unrand[j,i], sep=""), names(stim))]
    }
  }

dim(newstim)
names(newstim)

##subtract out all the 11-14 items (people only saw 1-10)
  
extra.stims<-c("Q3.L.F_1.11", "Q3.L.F_1.12","Q3.L.F_1.13","Q3.L.F_1.14","Q3.L.F_2.11","Q3.L.F_2.12","Q3.L.F_2.13","Q3.L.F_2.14","Q3.L.F_3.11","Q3.L.F_3.12","Q3.L.F_3.13","Q3.L.F_3.14","Q3.L.F_4.11","Q3.L.F_4.12","Q3.L.F_4.13","Q3.L.F_4.14","Q3.L.F_5.11","Q3.L.F_5.12","Q3.L.F_5.13","Q3.L.F_5.14","Q3.L.F_6.11","Q3.L.F_6.12","Q3.L.F_6.13","Q3.L.F_6.14","Q3.L.F_7.11","Q3.L.F_7.12","Q3.L.F_7.13","Q3.L.F_7.14")


newstim<-newstim[,!(names(newstim) %in% extra.stims)]
names(newstim)
conj.data<-newstim


##merge in respondent ID
conj.data$respid<-dd$respid
###spot check some cases to make sure everything worked
conj.data$respid[59] ##respid for 59th respondent is R_9B0gkuqApbJ7byR
conj.data[59,1:10] ##house data for 59th respodent is...
##house items for person R_9B0gkuqApbJ7byR should match:
#40 percent of pre-tax income 
#15 percent of pre-tax income 
#30 percent of pre-tax income 
#40 percent of pre-tax income 
#30 percent of pre-tax income 
#30 percent of pre-tax income 
#30 percent of pre-tax income 
#30 percent of pre-tax income 
#15 percent of pre-tax income 
#40 percent of pre-tax income 
##it all matches....now check a new person, new variable
conj.data$respid[3031] ##respid for 3031th respondent is R_9B0gkuqApbJ7byR
conj.data[3031,61:70] ##crime data for 3031th respodent is...
##crime data should match this....
#20% Less Crime Than National Average
#20% More Crime Than National Average
#20% Less Crime Than National Average
#20% Less Crime Than National Average
#20% More Crime Than National Average
#20% More Crime Than National Average
#20% Less Crime Than National Average
#20% More Crime Than National Average
#20% More Crime Than National Average
#20% Less Crime Than National Average
###this all matches

####
##reshape the data for estimation
####

##geneerate task indicator
task<-c(rep(1,dim(conj.data)[1]),rep(1,dim(conj.data)[1]),rep(2,dim(conj.data)[1]),rep(2,dim(conj.data)[1]),rep(3,dim(conj.data)[1]),rep(3,dim(conj.data)[1]),rep(4,dim(conj.data)[1]),rep(4,dim(conj.data)[1]),rep(5,dim(conj.data)[1]),rep(5,dim(conj.data)[1]) )


conj.house<-c((conj.data[, grep("pre-tax income", conj.data) ])[,1],(conj.data[, grep("pre-tax income", conj.data) ])[,2],(conj.data[, grep("pre-tax income", conj.data) ])[,3],(conj.data[, grep("pre-tax income", conj.data) ])[,4],(conj.data[, grep("pre-tax income", conj.data) ])[,5],(conj.data[, grep("pre-tax income", conj.data) ])[,6],(conj.data[, grep("pre-tax income", conj.data) ])[,7],(conj.data[, grep("pre-tax income", conj.data) ])[,8],(conj.data[, grep("pre-tax income", conj.data) ])[,9],(conj.data[, grep("pre-tax income", conj.data) ])[,10])
len(conj.house)

conj.pres<-c((conj.data[, grep("Democrat", conj.data) ])[,1],(conj.data[, grep("Democrat", conj.data) ])[,2],(conj.data[, grep("Democrat", conj.data) ])[,3],(conj.data[, grep("Democrat", conj.data) ])[,4],(conj.data[, grep("Democrat", conj.data) ])[,5],(conj.data[, grep("Democrat", conj.data) ])[,6],(conj.data[, grep("Democrat", conj.data) ])[,7],(conj.data[, grep("Democrat", conj.data) ])[,8],(conj.data[, grep("Democrat", conj.data) ])[,9],(conj.data[, grep("Democrat", conj.data) ])[,10])
len(conj.pres)

conj.race<-c((conj.data[, grep("Nonwhite", conj.data) ])[,1],(conj.data[, grep("Nonwhite", conj.data) ])[,2],(conj.data[, grep("Nonwhite", conj.data) ])[,3],(conj.data[, grep("Nonwhite", conj.data) ])[,4],(conj.data[, grep("Nonwhite", conj.data) ])[,5],(conj.data[, grep("Nonwhite", conj.data) ])[,6],(conj.data[, grep("Nonwhite", conj.data) ])[,7],(conj.data[, grep("Nonwhite", conj.data) ])[,8],(conj.data[, grep("Nonwhite", conj.data) ])[,9],(conj.data[, grep("Nonwhite", conj.data) ])[,10] )
len(conj.race)


conj.school<-c((conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,1],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,2],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,3],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,4],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,5],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,6],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,7],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,8],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,9],(conj.data[, grep("Q3.L.F_4", names(conj.data)) ])[,10] )
len(conj.school)

conj.commute<-c((conj.data[, grep("min", conj.data) ])[,1],(conj.data[, grep("min", conj.data) ])[,2],(conj.data[, grep("min", conj.data) ])[,3],(conj.data[, grep("min", conj.data) ])[,4],(conj.data[, grep("min", conj.data) ])[,5],(conj.data[, grep("min", conj.data) ])[,6],(conj.data[, grep("min", conj.data) ])[,7],(conj.data[, grep("min", conj.data) ])[,8],(conj.data[, grep("min", conj.data) ])[,9],(conj.data[, grep("min", conj.data) ])[,10]  )
len(conj.commute)


conj.place<-c((conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,1],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,2],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,3],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,4],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,5],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,6],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,7],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,8],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,9],(conj.data[, grep("Q3.L.F_6", names(conj.data)) ])[,10])
len(conj.place)

conj.crime<-c((conj.data[, grep("Crime", conj.data) ])[,1],(conj.data[, grep("Crime", conj.data) ])[,2],(conj.data[, grep("Crime", conj.data) ])[,3],(conj.data[, grep("Crime", conj.data) ])[,4],(conj.data[, grep("Crime", conj.data) ])[,5],(conj.data[, grep("Crime", conj.data) ])[,6],(conj.data[, grep("Crime", conj.data) ])[,7],(conj.data[, grep("Crime", conj.data) ])[,8],(conj.data[, grep("Crime", conj.data) ])[,9],(conj.data[, grep("Crime", conj.data) ])[,10]  )
len(conj.crime)

   
##combine all the data
conj.data.new<-cbind.data.frame(task, conj.house,conj.pres, conj.race, conj.school, conj.commute, conj.crime, conj.place)
dim(conj.data.new )   
head(conj.data.new)    

   
###generate the response variable for choosing community B, which is just the inverse of the current response variable for choosing community A

dd$conj.resp.1.2<-NA
dd$conj.resp.1.2[dd$traitA.conj1==1 ]<-0
dd$conj.resp.1.2[dd$traitA.conj1==0 ]<-1
dd$conj.resp.2.2<-NA
dd$conj.resp.2.2[dd$traitA.conj2==1 ]<-0
dd$conj.resp.2.2[dd$traitA.conj2==0 ]<-1
dd$conj.resp.3.2<-NA
dd$conj.resp.3.2[dd$traitA.conj3==1 ]<-0
dd$conj.resp.3.2[dd$traitA.conj3==0 ]<-1
dd$conj.resp.4.2<-NA
dd$conj.resp.4.2[dd$traitA.conj4==1 ]<-0
dd$conj.resp.4.2[dd$traitA.conj4==0 ]<-1
dd$conj.resp.5.2<-NA
dd$conj.resp.5.2[dd$traitA.conj5==1 ]<-0
dd$conj.resp.5.2[dd$traitA.conj5==0 ]<-1



##assemble the total response variable for both A and B
conj.response<-c(dd$traitA.conj1, dd$conj.resp.1.2, dd$traitA.conj2, dd$conj.resp.2.2, dd$traitA.conj3, dd$conj.resp.3.2, dd$traitA.conj4, dd$conj.resp.4.2, dd$traitA.conj5, dd$conj.resp.5.2)
conj.response<-as.n(as.c(conj.response))##make response variable numeric
len(conj.response)
table(conj.response)##there should be equal numbers of 0's and 1's since this variable is a vector of duplicates. For the row where person i saw community A and chose community A, the response variable gets a 1. For the row where person i saw community B and chose community A, the response variable gets a 0.
 

##gen profile indicator
conj.profile<-c(rep("A",len(dd$traitA.conj1)),rep("B",len(dd$traitA.conj1)), rep("A",len(dd$traitA.conj1)), rep("B",len(dd$traitA.conj1)),rep("A",len(dd$traitA.conj1)),rep("B",len(dd$traitA.conj1)), rep("A",len(dd$traitA.conj1)), rep("B",len(dd$traitA.conj1)), rep("A",len(dd$traitA.conj1)), rep("B",len(dd$traitA.conj1)))
len(conj.profile)



##gen respid variable
respid<-rep(conj.data$respid,10)
##gen party variable
partywlean<- rep(dd$partywlean, 10)
##gen subset vars
white_race<-rep(dd$white2,10)
female<-rep(dd$female, 10)
age<-rep(dd$age, 10)
income<-rep(dd$income.num,10)
education<-rep(dd$educ.years, 10)
black<-rep(dd$black2, 10)
libdem<-rep(dd$libdem,10)
conrep<-rep(dd$conrep,10)
rich<-rep(dd$rich, 10)
old<-rep(dd$old, 10)
ownrent<-rep(dd$ownrent2, 10)
educ<-rep(dd$educ2, 10)
homelast5<-rep(dd$homelast5,10)
kidshome<-rep(dd$kidshome,10)
city<-rep(dd$city,10)
black.therm<-rep(dd$black.therm,10)
black.hisp.therm<-rep(dd$black.hisp.therm,10)
white.black.diff<-rep(dd$white.black.diff,10)
working<-rep(dd$working, 10)
retired<-rep(dd$retired, 10)
single<-rep(dd$single, 10)
white.dem<-rep(dd$white.dem, 10)
same.msa<-rep(dd$same.msa,10)

##add all the data together to form the estimation data set
conj.data.party<-cbind.data.frame(partywlean=partywlean, respid=respid,white_race=white_race,black=black, female=female, age=age, income=income,   libdem=libdem, conrep=conrep, rich=rich, old=old, ownrent=ownrent, educ=educ, education=education, homelast5=homelast5,kidshome=kidshome, city=city, black.therm=black.therm, black.hisp.therm=black.hisp.therm, white.black.diff=white.black.diff,  working=working, retired=retired, single=single, white.dem=white.dem, same.msa=same.msa, task=task, profile=conj.profile, conj.data.new,response=conj.response)
conj.data.party$partywlean<-as.c(conj.data.party$partywlean)##make party variable a character
conj.data.party$respid<-as.c(conj.data.party$respid)##make respid var. a character
dim(conj.data.party)
head(conj.data.party)


##check that this worked. for a few random respondents
##respondent R_6My4tmgvrmYQCVf on the THIRD conjoint question, Q20.8, saw the following two scenarios according to the csv file:
##Community A
#commute time = 75 mins
#race = 75% white
#house = 15% of income
#place = rural area
#school = 9
#crime = 20% more
#pres = 30% dem

##Community B
#commute time = 10 mins
#race = 50% white
#house = 30% of income
#place = susburban nabe with mix
#school = 5
#crime = 20% more
#pres = 50% dem

##choice = community B

##view all responses/scenarios by this respondent, make sure the 5th and 6th rows match the values above
conj.data.test<-conj.data.party[conj.data.party$respid=="R_6My4tmgvrmYQCVf",]
conj.data.test[5:6,]


##this all checks out

##make the scenario columns character vectors
conj.data.party[,grep("conj",names(conj.data.party))]<-apply(conj.data.party[,grep("conj",names(conj.data.party))], 2, as.c)
head(conj.data.party)
class(conj.data.party$conj.race)



####recode values of factor variables to remove html junk

trait.cols<-grep("conj", names(conj.data.party))

dim(conj.data.party)
for (i in 1:len(trait.cols)){
conj.data.party[,trait.cols[i]]<-gsub("%20"," ",conj.data.party[,trait.cols[i]])
conj.data.party[,trait.cols[i]]<-gsub("%E2%80%93","–",conj.data.party[,trait.cols[i]])
}
dim(conj.data.party)

file.name<-paste(wd, "data","/conj.data.party.Rdata",sep="")
save(conj.data.party, file=file.name)


























#############################################
#############################################
#############################################
#############################################


##### BUILD ESTIMATION DATA FOR PAIRED COMPARISONS ANALYSIS


#############################################
#############################################
#############################################
#############################################


rm(list=ls()[ls()!="wd"])


as.n<-as.numeric
as.c<-as.character
as.f<-as.formula
len<-length

load("data/dd.final.Rdata")
dd<-dd.final
dim(dd)



##reshape the data

##make pair response variables numeric

pair.data<-cbind.data.frame(dd$pair.1, dd$pair.2, dd$pair.3, dd$pair.4,dd$pair.5, dd$pair.6, dd$pair.7, dd$pair.8, dd$pair.9)

for (i in 1:ncol(pair.data)){
pair.data[,i]<-as.n(as.c(pair.data[,i]))
}

pair.orders<-cbind.data.frame(dd$pair.1.order, dd$pair.2.order, dd$pair.3.order, dd$pair.4.order, dd$pair.5.order, dd$pair.6.order, dd$pair.7.order, dd$pair.8.order, dd$pair.9.order)

##make list of data frames to store new versions
df.unsplit<-list(NA)

for (i in 1:ncol(pair.data)){
mat.unsplit<-as.data.frame(matrix(, nrow=len(pair.data[,1]), ncol=3))
df.unsplit[[i]]<-assign(paste("pair.",i,".order.unsplit",sep=""), mat.unsplit)
}

###
##break apart order variables, which display the three items a respondent saw in the order they saw them. three for each question
###

for (j in 1:ncol(pair.orders)){
pair.unsplit<-str_split(pair.orders[,j], "\\|") 
for(i in 1:len(pair.unsplit)){
if(pair.unsplit[[i]]==""){pair.unsplit[[i]]<-c("NA" ,"NA" ,"NA")}
else{ pair.unsplit[[i]]<-pair.unsplit[[i]] }
}
mat.df  <- as.data.frame(matrix(unlist(pair.unsplit), ncol=3, byrow=TRUE))
mat.1<-as.n(as.c(gsub("NA",NA,mat.df[,1])))
mat.2<-as.n(as.c(gsub("NA",NA,mat.df[,2])))
mat.3<-as.n(as.c(gsub("NA",NA,mat.df[,3])))
mat.df<-cbind.data.frame(mat.1, mat.2, mat.3)
colnames(mat.df)<-c(paste("pair.",j,".1", sep="" ),paste("pair.",j,".2", sep="" ), paste("pair.",j,".3", sep="" ))
df.unsplit[[j]]<-mat.df
}


df.unsplit[[1]][1:50,]

##merge reshaped variables back into original data frame
dd<-cbind.data.frame(dd, df.unsplit)
names(dd)

##check this
dd$pair.1.order[1:50]
dd$pair.1.1[1:50]
dd$pair.1.2[1:50]
dd$pair.1.3[1:50]


##generate matrices to store values indicating whether a respondent saw a certain item within each pair and scoring them if they did; NA otherwise

##make list of data frames to store new versions
pair.means.combined<-list(NA)


##generate a list of 9 empty response matrices (all NAs)
for (i in 1:ncol(pair.data)){
mat.mean<-matrix(nrow=len(dd$pair.1),ncol=64)
pair.means.combined[[i]]<-assign(paste("pair.",i,".means",sep=""), mat.mean)
}


##populate the 9 response matrices with the winning/losing response values
for (k in 1:ncol(pair.data)){
for (j in 1:64){
for (i in 1:len(pair.data[,1])){
	##assign the 1's (winning items)
	if (pair.data[i,k]==j & !is.na(pair.data[i,k]) ){pair.means.combined[[k]][i,j]<-1}
	#assign the 0's (losing items)
	if (pair.data[i,k]!=j & !is.na(pair.data[i,k]) 
	& (df.unsplit[[k]][i,1]==j|df.unsplit[[k]][i,2]==j|df.unsplit[[k]][i,3]==j)==TRUE  
	& !is.na(df.unsplit[[k]][i,1]) & !is.na(df.unsplit[[k]][i,2]) & !is.na(df.unsplit[[k]][i,3]))
	{pair.means.combined[[k]][i,j]<-0}
	}}}

##check that it worked (need to write new checks)
##chose 45 over 19 and 61
dd$pair.4[dd$respid=="R_3sFMXxWiYoXBKsd"]==45 
row<-which(dd$respid=="R_3sFMXxWiYoXBKsd")
pair.means.combined[[4]][row,45]
pair.means.combined[[4]][row,19]
pair.means.combined[[4]][row,61]


##pair.2 R_a9600CLlX80v4Mt...respondent chose 61 over 9 and 30
dd$pair.2[dd$respid=="R_a9600CLlX80v4Mt"]==61
row<-which(dd$respid=="R_a9600CLlX80v4Mt")
pair.means.combined[[2]][row,9]
pair.means.combined[[2]][row,30]
pair.means.combined[[2]][row,61]

##checks out


####!!!!! REMOVE COLUMN 63. This is a blank column because there was no item #63. After removal response item #64 will be contained in Column 63
for (i in 1:ncol(pair.data)){
	pair.means.combined[[i]]<-pair.means.combined[[i]][, -63]
	}



##add subsetting variables indices to all the #matrices
sort.vars<-cbind.data.frame(partywlean=dd$partywlean, white=dd$white2,black=dd$black2,  female=dd$female, age=dd$age, education=dd$educ.years, income=dd$income.num,  libdem=dd$libdem, conrep=dd$conrep, rich=dd$rich, old=dd$old, ownrent=dd$ownrent2, educ=dd$educ2, homelast5=dd$homelast5,kidshome=dd$kidshome, city=dd$city, black.therm=dd$black.therm,black.hisp.therm=dd$black.hisp.therm, white.black.diff=dd$white.black.diff, young1=dd$young1,young2=dd$young2, young3=dd$young3, working=dd$working, retired=dd$retired, single=dd$single, white.dem=dd$white.dem, same.msa=dd$same.msa,respid=dd$respid)



for (i in 1:ncol(pair.data)){
	pair.means.combined[[i]]<-cbind.data.frame(pair.means.combined[[i]], sort.vars)
	}


##stack all the matrices into estimation dataframe
pair.means.all<-rbind.data.frame(pair.means.combined[[1]], pair.means.combined[[2]], pair.means.combined[[3]],pair.means.combined[[4]],pair.means.combined[[5]],pair.means.combined[[6]],pair.means.combined[[7]],pair.means.combined[[8]],pair.means.combined[[9]]
)
dim(pair.means.all)

quickfire.data.final<-pair.means.all


file.name=paste(wd, "data/quickfire.data.final.Rdata", sep="")
save(quickfire.data.final, file=file.name)









#################################################
#################################################
#################################################
#################################################
############# NEIGHBORHOOD QUALITY PROXY ANALYSIS
#################################################
#################################################
#################################################
#################################################


##make sure to get permission from ICPSR before redistributing in replication file...see tmers of use

#source for crime data: http://www.icpsr.umich.edu/icpsrweb/DSDR/studies/35019
load("data/crime/ICPSR_35019/DS0004/35019-0004-Data.rda")

##UCR crimes reported per county, 2012 

d<-da35019.0004 ##rename data frame

head(d)


## 

##total county population of agencies reporting crimes
class(d$CPOPCRIM)


##total violent crimes reported in county
class(d$VIOL)


##generate violdent crimes per capita

d$vpc<-d$VIOL / d$CPOPCRIM
d$vpc[d$vpc==Inf]<-NA
summary(d$vpc)


table(d$FIPS_ST)

##format state fips code
d$FIPS_ST<-as.character(d$FIPS_ST)
table(nchar(d$FIPS_ST))
for(i in 1:nrow(d)){
	
	if(nchar(d$FIPS_ST[i] )==1 ){ d$FIPS_ST[i]<-paste("0",d$FIPS_ST[i],sep="")  }
	
}
table(nchar(d$FIPS_ST))

##format county fips code
d$FIPS_CTY<-as.character(d$FIPS_CTY)
table(nchar(d$FIPS_CTY))

for(i in 1:nrow(d)){
	
	if(nchar(d$FIPS_CTY[i] )==1 ){ d$FIPS_CTY[i]<-paste("00",d$FIPS_CTY[i],sep="")  }
	if(nchar(d$FIPS_CTY[i] )==2 ){ d$FIPS_CTY[i]<-paste("0",d$FIPS_CTY[i],sep="")  }
	
}
table(nchar(d$FIPS_CTY))

##generate 5-digit fips code

d$fips<-paste(d$FIPS_ST, d$FIPS_CTY, sep="")
head(d$fips)
table(nchar(d$fips))
length(unique(d$fips))##3178 unique fips codes



############### EDUCATION DATA

###percent in county with BA

##see meta data file in the same folder for codebook

ed<-read.csv("data/crime/county_education_2012ACS/acs_ed_2012.csv", header=T)
head(ed)

##total bachelor's or higher in percents

##of those 25 and over
summary(ed$HC01_EST_VC17)
ed$pctba<-ed$HC01_EST_VC17
summary(ed$pctba)
head(ed$pctba)

##population 18-24
ed$pop_18_24<-ed$HC01_EST_VC01

##population 25 and over
ed$pop_25_over<-ed$HC01_EST_VC07
head(ed$pop_25_over)

##total pop
ed$pop_total<-ed$pop_18_24+ed$pop_25_over

##back out raw number of degrees
ed$ba_raw<-(ed$pctba/100)*ed$pop_25_over
head(ed$ba_raw)

##county name
ed$county_name<-as.character(ed$GEO.display.label)

###county fips codes
ed$fips<-as.character(ed$GEO.id2)
head(ed)
table(nchar(ed$fips))
for(i in 1:nrow(ed)){
	
	if(nchar(ed$fips[i])==4){ed$fips[i]<-paste("0",ed$fips[i],sep="")}
	
}
table(nchar(ed$fips))

ba<-ed[,c("fips","county_name","pctba","ba_raw")]

head(ba)
length(unique(ed$fips)) ##3143 unique fips codes


########### OWNER OCCUPIED HOUSING DATA


###percent owner-occupied housing in county 

##see meta data file in the same folder for codebook

own<-read.csv("data/crime/county_ownocc_2012ACS/acs_ownocc_2012.csv", header=T)
head(own)


##total occupied housing

summary(own$HC01_EST_VC01)
own$total_house_units<-own$HC01_EST_VC01

##total owner occupied housing units

summary(own$HC02_EST_VC01)
own$own.occ<-own$HC02_EST_VC01

own$pctown<-own$own.occ /own$total_house_units
summary(own$pctown)

##county name
own$county_name<-as.character(own$GEO.display.label)

###county fips codes
own$fips<-as.character(own$GEO.id2)
head(own)
table(nchar(own$fips))
for(i in 1:nrow(own)){
	
	if(nchar(own$fips[i])==4){own$fips[i]<-paste("0",own$fips[i],sep="")}
	
}
table(nchar(own$fips))

own<-own[,c("fips","county_name","pctown","total_house_units")]

head(own)
length(unique(own$fips)) ##3143 unique fips codes



###merge the three data sets together

dim(d)
dim(ba)
dim(own)
qual<-merge(d, ba, by="fips")
dim(qual)
qual<-merge(qual, own, by="fips")
head(qual)
dim(qual)

length(unique(qual$fips))##3137 unique fips codes, same as rows in data
table(qual$county_name.x==qual$county_name.y)
save(qual, file="data/qual.proxies.Rdata")












## Community preference design
####
####
###COMM PREF items: create single response vector (0's and 1's) and accompanying treatment factor vector
#####
####

##first need to ID the treatment arms. There are 9, each with roughly 600 people; 1339 missing
table(dd$comm.pref.arm)
## 1st response variable, dichotomous

##test to see everyone only got 1 of 9 conditions
dd$comm.resp.test<-paste(dd$rp.will.prefa1,dd$rp.will.prefa2,dd$rp.will.prefa3, dd$rp.will.prefa4,dd$rp.will.prefa5,dd$rp.will.prefa6,dd$rp.will.prefa7,dd$rp.will.prefa8,dd$rp.will.prefa9, sep="")

table(dd$comm.resp.test)
dd$comm.resp<-as.n(as.c(dd$comm.resp.test))
table(dd$comm.resp)

##second answer to comm pref items
dd$comm.resp2.test<-paste(dd$rp.attr.prefa1,dd$rp.attr.prefa2,dd$rp.attr.prefa3, dd$rp.attr.prefa4,dd$rp.attr.prefa5,dd$rp.attr.prefa6,dd$rp.attr.prefa7,dd$rp.attr.prefa8,dd$rp.attr.prefa9, sep="")

table(dd$comm.resp2.test) 

dd$comm.resp2<-as.n(as.c(dd$comm.resp2.test))









#######################################
#####
### Community Preference Factorial Experiment 
#####
#######################################

load("data/dd.final.Rdata")
dd<-dd.final
dim(dd)


## Community preference design
####
####
###COMM PREF items: create single response vector (0's and 1's) and accompanying treatment factor vector
#####
####

##first need to ID the treatment arms. There are 9, each with roughly 600 people; 1339 missing
table(dd$comm.pref.arm)
## 1st response variable, dichotomous

##test to see everyone only got 1 of 9 conditions
dd$comm.resp.test<-paste(dd$rp.will.prefa1,dd$rp.will.prefa2,dd$rp.will.prefa3, dd$rp.will.prefa4,dd$rp.will.prefa5,dd$rp.will.prefa6,dd$rp.will.prefa7,dd$rp.will.prefa8,dd$rp.will.prefa9, sep="")

table(dd$comm.resp.test)
dd$comm.resp<-as.n(as.c(dd$comm.resp.test))
table(dd$comm.resp)

##second answer to comm pref items
dd$comm.resp2.test<-paste(dd$rp.attr.prefa1,dd$rp.attr.prefa2,dd$rp.attr.prefa3, dd$rp.attr.prefa4,dd$rp.attr.prefa5,dd$rp.attr.prefa6,dd$rp.attr.prefa7,dd$rp.attr.prefa8,dd$rp.attr.prefa9, sep="")

table(dd$comm.resp2.test) 

dd$comm.resp2<-as.n(as.c(dd$comm.resp2.test))


#####
#####
### MEANS for Comm. Pref. 
#####
#####

###
comm.treat.arms<-c("Comm Pref A1","Comm Pref A2","Comm Pref A3","Comm Pref A4","Comm Pref A5","Comm Pref A6","Comm Pref A7","Comm Pref A8","Comm Pref A9")
comm.treat.arms2<-c("Control/Control","Control/Dem","Control/Rep","White/Rep","White/Control","White/Dem","Minority/Rep","Minority/Control","Minority/Dem")

comm.results.combined<-list(NA)

comm.results.data<-list(all=dd, dems=subset(dd, dd$partywlean=="dem"), reps=subset(dd, dd$partywlean=="rep"))

##COMM PREF ITEMS - whole sample
for (j in 1:3){
t.mean<-NA
t.lb<-NA
t.ub<-NA
t.stat<-NA
t2.mean<-NA
t2.lb<-NA
t2.ub<-NA
t2.stat<-NA
for (i in 1:len(comm.treat.arms)){
	t1<-t.test((comm.results.data[[j]]$comm.resp[comm.results.data[[j]]$comm.pref.arm==comm.treat.arms[i]]), na.rm=TRUE )
	t2<-t.test((comm.results.data[[j]]$comm.resp2[comm.results.data[[j]]$comm.pref.arm==comm.treat.arms[i]]), na.rm=TRUE )
	t.mean[i]<-as.n(t1$estimate)
    t.lb[i] <-t1$conf.int[1]
	t.ub[i] <-t1$conf.int[2]
    t.stat[i] <-as.n(t1$statistic)
    t2.mean[i]<-as.n(t2$estimate)
    t2.lb[i] <-t2$conf.int[1]
	t2.ub[i] <-t2$conf.int[2]
    t2.stat[i] <-as.n(t2$statistic)
    treat.index<-c(1:9)
    }
if(j==1){
	label<-rep("All",9)
	all.1<-cbind.data.frame(label, treat.index,comm.treat.arms2, t.mean, t.lb, t.ub, t2.mean, t2.lb, t2.ub)
	colnames(all.1)<-c("Sample","Treat Index","TreatArm","Mean1", "LB95", "UB95","Mean 2", "LB95.2", "UB95.2" )
		comm.results.combined[[j]]<-all}
if(j==2){label<-rep("Dem",9)
	dems<-cbind.data.frame(label, treat.index,comm.treat.arms2, t.mean, t.lb, t.ub, t2.mean, t2.lb, t2.ub)
	colnames(dems)<-c("Sample","Treat Index","TreatArm","Mean1", "LB95", "UB95","Mean 2", "LB95.2", "UB95.2" ) 
	comm.results.combined[[j]]<-dems}
if(j==3){label<-rep("Rep",9)
	reps<-cbind.data.frame(label, treat.index,comm.treat.arms2, t.mean, t.lb, t.ub, t2.mean, t2.lb, t2.ub)
	colnames(reps)<-c("Sample","Treat Index","TreatArm","Mean1", "LB95", "UB95","Mean 2", "LB95.2", "UB95.2")  
	comm.results.combined[[j]]<-reps}
	}

##save the sorted table 
file.name<-paste(wd, "output/", "comm.pref.results.Rdata",sep="")
save(comm.results.combined, file=file.name,row.names=T)



##order results for graphing
comm.results.combined[[2]]$sort.num<-seq(2,2*len(comm.results.combined[[2]][,1]), by=2)
comm.results.combined[[3]]$sort.num<-seq(1,2*len(comm.results.combined[[2]][,1])-1, by=2)
graph.data<-rbind.data.frame(comm.results.combined[[2]], comm.results.combined[[3]])
graph.data<-graph.data[order(graph.data$sort.num), ]
graph.data

##
##NO party info
##
graph.data$new.order<-NA
graph.data$new.order[graph.data$TreatArm=="Control/Control" & graph.data$Sample=="Rep"]<-1
graph.data$new.order[graph.data$TreatArm=="Control/Control" & graph.data$Sample=="Dem"]<-2
graph.data$new.order[graph.data$TreatArm=="Minority/Control" & graph.data$Sample=="Rep"]<-3
graph.data$new.order[graph.data$TreatArm=="Minority/Control" & graph.data$Sample=="Dem"]<-4
graph.data$new.order[graph.data$TreatArm=="White/Control" & graph.data$Sample=="Rep"]<-5
graph.data$new.order[graph.data$TreatArm=="White/Control" & graph.data$Sample=="Dem"]<-6

###
##OWN PARTY
###
##no race/own party
graph.data$new.order[graph.data$TreatArm=="Control/Rep" & graph.data$Sample=="Rep"]<-7
graph.data$new.order[graph.data$TreatArm=="Control/Dem" & graph.data$Sample=="Dem"]<-8
##70% white/own party
graph.data$new.order[graph.data$TreatArm=="Minority/Rep" & graph.data$Sample=="Rep"]<-9
graph.data$new.order[graph.data$TreatArm=="Minority/Dem" & graph.data$Sample=="Dem"]<-10
##96% white/own party
graph.data$new.order[graph.data$TreatArm=="White/Rep" & graph.data$Sample=="Rep"]<-11
graph.data$new.order[graph.data$TreatArm=="White/Dem" & graph.data$Sample=="Dem"]<-12

###
## OUT PARTY
##

#none/out party
graph.data$new.order[graph.data$TreatArm=="Control/Dem" & graph.data$Sample=="Rep"]<-13
graph.data$new.order[graph.data$TreatArm=="Control/Rep" & graph.data$Sample=="Dem"]<-14
##minority/out party
graph.data$new.order[graph.data$TreatArm=="Minority/Dem" & graph.data$Sample=="Rep"]<-15
graph.data$new.order[graph.data$TreatArm=="Minority/Rep" & graph.data$Sample=="Dem"]<-16
#white/out-party
graph.data$new.order[graph.data$TreatArm=="White/Dem" & graph.data$Sample=="Rep"]<-17
graph.data$new.order[graph.data$TreatArm=="White/Rep" & graph.data$Sample=="Dem"]<-18

graph.data

graph.data<-graph.data[order(graph.data$new.order), ]
comm.graph.data<-graph.data

file.name<-paste(wd, "output/", "comm.graph.data.Rdata",sep="")
save(comm.graph.data, file=file.name,row.names=T)

